From 3e3bdeba229579734cb2241e1ac1b86376c1d17f Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Fri, 5 Mar 2021 15:46:12 +0300 Subject: [PATCH 01/16] Fix CamVid --- datumaro/plugins/camvid_format.py | 12 ++++++++++-- tests/test_camvid_format.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index ace780148b..2a3c9aaf6c 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -7,7 +7,6 @@ import os.path as osp from collections import OrderedDict from enum import Enum -from glob import glob import numpy as np from datumaro.components.converter import Converter @@ -157,6 +156,15 @@ def _load_items(self, path): with open(path, encoding='utf-8') as f: for line in f: objects = line.split() + if 2 < len(objects): + if len(objects) % 2: + raise Exception("Line %s: image and gt file must have " + "the same name" % line) + else: + mid = int(len(objects) / 2) + objects[0] = ' '.join(objects[i] for i in range(mid)) + objects[1] = ' '.join(objects[i] for i in range(mid, 2 * mid)) + objects = objects[:2] image = objects[0] item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, @@ -262,7 +270,7 @@ def save_segm_lists(self, subset_name, segm_list): return ann_file = osp.join(self._save_dir, subset_name + '.txt') - with open(ann_file, 'w') as f: + with open(ann_file, 'w' , encoding='utf-8') as f: for item in segm_list: if segm_list[item]: path_mask = '/%s/%s' % (subset_name + CamvidPath.SEGM_DIR, diff --git a/tests/test_camvid_format.py b/tests/test_camvid_format.py index 85e0b6e7d9..1a1c17d429 100644 --- a/tests/test_camvid_format.py +++ b/tests/test_camvid_format.py @@ -145,6 +145,22 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(CamvidConverter.convert, label_map='camvid'), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='кириллица в имени файла', + image=np.ones((1, 5, 3)), annotations=[ + Mask(image=np.array([[1, 0, 0, 1, 0]]), label=0), + Mask(image=np.array([[0, 1, 1, 0, 1]]), label=3), + ] + ), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(CamvidConverter.convert, label_map='camvid'), test_dir) + def test_can_save_with_no_masks(self): class TestExtractor(TestExtractorBase): def __iter__(self): From 382e7019ebad0bd9048bf73ccb9b2dba5b120b18 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Wed, 10 Mar 2021 11:08:09 +0300 Subject: [PATCH 02/16] add test and fixes to other formats --- datumaro/plugins/imagenet_format.py | 2 +- datumaro/plugins/imagenet_txt_format.py | 20 +++++++--- datumaro/plugins/lfw_format.py | 46 +++++++++++++++++------ datumaro/plugins/vgg_face2_format.py | 8 ++-- datumaro/plugins/voc_format/converter.py | 16 ++++---- datumaro/plugins/voc_format/extractor.py | 10 +++-- datumaro/plugins/widerface_format.py | 4 +- datumaro/plugins/yolo_format/converter.py | 8 ++-- datumaro/plugins/yolo_format/extractor.py | 10 ++--- tests/test_camvid_format.py | 2 +- tests/test_coco_format.py | 10 +++++ tests/test_cvat_format.py | 34 +++++++++++++++++ tests/test_datumaro_format.py | 9 +++++ tests/test_icdar_format.py | 12 ++++++ tests/test_image_dir_format.py | 11 +++++- tests/test_imagenet_format.py | 18 +++++++++ tests/test_imagenet_txt_format.py | 19 ++++++++++ tests/test_labelme_format.py | 
42 +++++++++++++++++++++ tests/test_lfw_format.py | 24 ++++++++++++ tests/test_market1501_format.py | 18 +++++++++ tests/test_mots_format.py | 13 +++++++ tests/test_tfrecord_format.py | 21 +++++++++++ tests/test_vgg_face2_format.py | 16 ++++++++ tests/test_voc_format.py | 14 +++++++ tests/test_widerface_format.py | 20 ++++++++++ tests/test_yolo_format.py | 18 +++++++++ 26 files changed, 378 insertions(+), 47 deletions(-) diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py index 9702262008..829534d700 100644 --- a/datumaro/plugins/imagenet_format.py +++ b/datumaro/plugins/imagenet_format.py @@ -1,4 +1,4 @@ - +#_*_ coding:utf-8 _*_ # Copyright (C) 2020 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py index 36ee68a7c5..f24c7e6bc1 100644 --- a/datumaro/plugins/imagenet_txt_format.py +++ b/datumaro/plugins/imagenet_txt_format.py @@ -49,9 +49,14 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - item = line.split() - item_id = item[0] - label_ids = [int(id) for id in item[1:]] + item = line.split('\"') + if len(item) == 3: + item_id = item[1] + label_ids = [int(id) for id in item[2].split()] + else: + item = line.split() + item_id = item[0] + label_ids = [int(id) for id in item[1:]] anno = [] for label in label_ids: assert 0 <= label and \ @@ -96,10 +101,13 @@ def apply(self): self._save_image(item, osp.join(self._save_dir, ImagenetTxtPath.IMAGE_DIR, self._make_image_filename(item))) - + annotation = '' + for item_id, item_labels in labels.items(): + if 1 < len(item_id.split()): + item_id = '\"' + item_id + '\"' + annotation += '%s %s\n' % (item_id, ' '.join(item_labels)) with open(annotation_file, 'w', encoding='utf-8') as f: - f.writelines(['%s %s\n' % (item_id, ' '.join(labels[item_id])) - for item_id in labels]) + f.write(annotation) labels_file = osp.join(subset_dir, ImagenetTxtPath.LABELS_FILE) with open(labels_file, 'w', encoding='utf-8') as f: diff --git a/datumaro/plugins/lfw_format.py b/datumaro/plugins/lfw_format.py index 3d16a2949f..44370475f4 100644 --- a/datumaro/plugins/lfw_format.py +++ b/datumaro/plugins/lfw_format.py @@ -31,10 +31,14 @@ def _load_items(self, path): images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR) with open(path, encoding='utf-8') as f: for line in f: - pair = line.strip().split() + pair = line.strip().split('\t') if len(pair) == 3: - image1 = self.get_image_name(pair[0], pair[1]) - image2 = self.get_image_name(pair[0], pair[2]) + if pair[0] == '-': + image1 = pair[1] + image2 = pair[2] + else: + image1 = self.get_image_name(pair[0], pair[1]) + image2 = self.get_image_name(pair[0], pair[2]) if image1 not in items: items[image1] = DatasetItem(id=image1, subset=self._subset, image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT), @@ -47,8 +51,14 @@ def _load_items(self, path): attributes = items[image1].attributes attributes['positive_pairs'].append(image2) elif len(pair) == 4: - image1 = self.get_image_name(pair[0], pair[1]) - image2 = self.get_image_name(pair[2], pair[3]) + if pair[0] == '-': + image1 = pair[1] + else: + image1 = self.get_image_name(pair[0], pair[1]) + if pair[2] == '-': + image2 = pair[3] + else: + image2 = self.get_image_name(pair[2], pair[3]) if image1 not in items: items[image1] = DatasetItem(id=image1, subset=self._subset, image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT), @@ -102,17 +112,31 @@ def apply(self): 
self._save_image(item, osp.join(self._save_dir, subset_name, LfwPath.IMAGES_DIR, item.id + LfwPath.IMAGE_EXT)) - person1, num1 = LfwPath.PATTERN.search(item.id).groups() - num1 = int(num1) + search = LfwPath.PATTERN.search(item.id) + if search: + person1, num1 = search.groups() + num1 = int(num1) + else: + person1 = '-' + num1 = item.id if 'positive_pairs' in item.attributes: for pair in item.attributes['positive_pairs']: - num2 = LfwPath.PATTERN.search(pair).groups()[1] - num2 = int(num2) + search = LfwPath.PATTERN.search(pair) + if search: + num2 = search.groups()[1] + num2 = int(num2) + else: + num2 = pair positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2)) if 'negative_pairs' in item.attributes: for pair in item.attributes['negative_pairs']: - person2, num2 = LfwPath.PATTERN.search(pair).groups() - num2 = int(num2) + search = LfwPath.PATTERN.search(pair) + if search: + person2, num2 = search.groups() + num2 = int(num2) + else: + person2 = '-' + num2 = pair negative_pairs.append('%s\t%s\t%s\t%s' % \ (person1, num1, person2, num2)) diff --git a/datumaro/plugins/vgg_face2_format.py b/datumaro/plugins/vgg_face2_format.py index c38478193b..e4e718606e 100644 --- a/datumaro/plugins/vgg_face2_format.py +++ b/datumaro/plugins/vgg_face2_format.py @@ -68,7 +68,7 @@ def _split_item_path(path): items = {} - with open(path) as content: + with open(path, encoding='utf-8') as content: landmarks_table = list(csv.DictReader(content)) for row in landmarks_table: item_id = row['NAME_ID'] @@ -96,7 +96,7 @@ def _split_item_path(path): bboxes_path = osp.join(self._dataset_dir, VggFace2Path.ANNOTATION_DIR, VggFace2Path.BBOXES_FILE + self._subset + '.csv') if osp.isfile(bboxes_path): - with open(bboxes_path) as content: + with open(bboxes_path, encoding='utf-8') as content: bboxes_table = list(csv.DictReader(content)) for row in bboxes_table: item_id = row['NAME_ID'] @@ -224,7 +224,7 @@ def apply(self): landmarks_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR, VggFace2Path.LANDMARKS_FILE + subset_name + '.csv') os.makedirs(osp.dirname(landmarks_path), exist_ok=True) - with open(landmarks_path, 'w', newline='') as file: + with open(landmarks_path, 'w', encoding='utf-8', newline='') as file: columns = ['NAME_ID', 'P1X', 'P1Y', 'P2X', 'P2Y', 'P3X', 'P3Y', 'P4X', 'P4Y', 'P5X', 'P5Y'] writer = csv.DictWriter(file, fieldnames=columns) @@ -235,7 +235,7 @@ def apply(self): bboxes_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR, VggFace2Path.BBOXES_FILE + subset_name + '.csv') os.makedirs(osp.dirname(bboxes_path), exist_ok=True) - with open(bboxes_path, 'w', newline='') as file: + with open(bboxes_path, 'w', encoding='utf-8', newline='') as file: columns = ['NAME_ID', 'X', 'Y', 'W', 'H'] writer = csv.DictWriter(file, fieldnames=columns) writer.writeheader() diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index a022d04265..73c9925292 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -296,7 +296,7 @@ def save_subsets(self): VocTask.action_classification}: ann_path = osp.join(self._ann_dir, item.id + '.xml') os.makedirs(osp.dirname(ann_path), exist_ok=True) - with open(ann_path, 'w') as f: + with open(ann_path, 'w', encoding='utf-8') as f: f.write(ET.tostring(root_elem, encoding='unicode', pretty_print=True)) @@ -350,7 +350,7 @@ def save_subsets(self): @staticmethod def _get_filtered_lines(path, patch, subset, items=None): lines = {} - with open(path) as f: + with open(path, encoding='utf-8') as f: 
for line in f: item, text, _ = line.split(maxsplit=1) + ['', ''] if not patch or patch.updated_items.get((item, subset)) != \ @@ -367,7 +367,7 @@ def save_action_lists(self, subset_name, action_list): items = {k: True for k in action_list} if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: f.write('%s\n' % item) @@ -392,7 +392,7 @@ def _write_item(f, item, objs, action): if self._patch and osp.isfile(ann_file): lines = self._get_filtered_lines(ann_file, None, subset_name) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: if item in action_list: _write_item(f, item, action_list[item], action) @@ -418,7 +418,7 @@ def _write_item(f, item, item_labels): lines = self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: if item in class_lists: _write_item(f, item, class_lists[item]) @@ -433,7 +433,7 @@ def save_clsdet_lists(self, subset_name, clsdet_list): if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: f.write('%s\n' % item) @@ -445,7 +445,7 @@ def save_segm_lists(self, subset_name, segm_list): if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: f.write('%s\n' % item) @@ -465,7 +465,7 @@ def _write_item(f, item, item_layouts): if self._patch and osp.isfile(ann_file): self._get_filtered_lines(ann_file, self._patch, subset_name, items) - with open(ann_file, 'w') as f: + with open(ann_file, 'w', encoding='utf-8') as f: for item in items: if item in layout_list: _write_item(f, item, layout_list[item]) diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index 655d72b893..b84ac0fd23 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -58,8 +58,8 @@ def _load_categories(dataset_path): @staticmethod def _load_subset_list(subset_path): - with open(subset_path) as f: - return [line.split()[0] for line in f] + with open(subset_path, encoding='utf-8') as f: + return [line.strip() for line in f] class VocClassificationExtractor(_VocExtractor): def __iter__(self): @@ -78,11 +78,13 @@ def _load_annotations(self): anno_files = [s for s in dir_items(task_dir, '.txt') if s.endswith('_' + osp.basename(self._path))] for ann_filename in anno_files: - with open(osp.join(task_dir, ann_filename)) as f: + with open(osp.join(task_dir, ann_filename), encoding='utf-8') as f: label = ann_filename[:ann_filename.rfind('_')] label_id = self._get_label_id(label) for line in f: - item, present = line.split() + objects = line.split() + item = ' '.join(objects[i] for i in range(len(objects) - 1)) + present = objects[-1] if present == '1': annotations[item].append(label_id) diff --git a/datumaro/plugins/widerface_format.py b/datumaro/plugins/widerface_format.py index 87005b66ad..5a968f9c8c 100644 --- a/datumaro/plugins/widerface_format.py +++ b/datumaro/plugins/widerface_format.py @@ -62,7 +62,7 @@ def _load_categories(self): def _load_items(self, path): items = {} - with open(path, 'r') as f: + 
with open(path, 'r', encoding='utf-8') as f: lines = f.readlines() image_ids = [image_id for image_id, line in enumerate(lines) @@ -178,5 +178,5 @@ def apply(self): annotation_path = osp.join(save_dir, WiderFacePath.ANNOTATIONS_DIR, 'wider_face_' + subset_name + '_bbx_gt.txt') os.makedirs(osp.dirname(annotation_path), exist_ok=True) - with open(annotation_path, 'w') as f: + with open(annotation_path, 'w', encoding='utf-8') as f: f.write(wider_annotation) diff --git a/datumaro/plugins/yolo_format/converter.py b/datumaro/plugins/yolo_format/converter.py index 351636b5d8..fb71b8f172 100644 --- a/datumaro/plugins/yolo_format/converter.py +++ b/datumaro/plugins/yolo_format/converter.py @@ -39,7 +39,7 @@ def apply(self): label_categories = extractor.categories()[AnnotationType.label] label_ids = {label.name: idx for idx, label in enumerate(label_categories.items)} - with open(osp.join(save_dir, 'obj.names'), 'w') as f: + with open(osp.join(save_dir, 'obj.names'), 'w', encoding='utf-8') as f: f.writelines('%s\n' % l[0] for l in sorted(label_ids.items(), key=lambda x: x[1])) @@ -88,15 +88,15 @@ def apply(self): annotation_path = osp.join(subset_dir, '%s.txt' % item.id) os.makedirs(osp.dirname(annotation_path), exist_ok=True) - with open(annotation_path, 'w') as f: + with open(annotation_path, 'w', encoding='utf-8') as f: f.write(yolo_annotation) subset_list_name = '%s.txt' % subset_name subset_lists[subset_name] = subset_list_name - with open(osp.join(save_dir, subset_list_name), 'w') as f: + with open(osp.join(save_dir, subset_list_name), 'w', encoding='utf-8') as f: f.writelines('%s\n' % s for s in image_paths.values()) - with open(osp.join(save_dir, 'obj.data'), 'w') as f: + with open(osp.join(save_dir, 'obj.data'), 'w', encoding='utf-8') as f: f.write('classes = %s\n' % len(label_ids)) for subset_name, subset_list_name in subset_lists.items(): diff --git a/datumaro/plugins/yolo_format/extractor.py b/datumaro/plugins/yolo_format/extractor.py index 54774f08cb..5b5e8d20ec 100644 --- a/datumaro/plugins/yolo_format/extractor.py +++ b/datumaro/plugins/yolo_format/extractor.py @@ -52,14 +52,14 @@ def __init__(self, config_path, image_info=None): if isinstance(image_info, str): if not osp.isfile(image_info): raise Exception("Can't read image meta file '%s'" % image_info) - with open(image_info) as f: + with open(image_info, encoding='utf-8') as f: image_info = {} for line in f: image_name, h, w = line.strip().split() image_info[image_name] = (int(h), int(w)) self._image_info = image_info - with open(config_path, 'r') as f: + with open(config_path, 'r', encoding='utf-8') as f: config_lines = f.readlines() subsets = OrderedDict() @@ -89,7 +89,7 @@ def __init__(self, config_path, image_info=None): raise Exception("Not found '%s' subset list file" % subset_name) subset = YoloExtractor.Subset(subset_name, self) - with open(list_path, 'r') as f: + with open(list_path, 'r', encoding='utf-8') as f: subset.items = OrderedDict( (self.name_from_path(p), self.localize_path(p)) for p in f @@ -143,7 +143,7 @@ def _get(self, item_id, subset_name): @staticmethod def _parse_annotations(anno_path, image): lines = [] - with open(anno_path, 'r') as f: + with open(anno_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if line: @@ -174,7 +174,7 @@ def _parse_annotations(anno_path, image): def _load_categories(names_path): label_categories = LabelCategories() - with open(names_path, 'r') as f: + with open(names_path, 'r', encoding='utf-8') as f: for label in f: label_categories.add(label.strip()) diff 
--git a/tests/test_camvid_format.py b/tests/test_camvid_format.py index 1a1c17d429..dba9d0116d 100644 --- a/tests/test_camvid_format.py +++ b/tests/test_camvid_format.py @@ -149,7 +149,7 @@ def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): class TestExtractor(TestExtractorBase): def __iter__(self): return iter([ - DatasetItem(id='кириллица в имени файла', + DatasetItem(id='кириллица с пробелом', image=np.ones((1, 5, 3)), annotations=[ Mask(image=np.array([[1, 0, 0, 1, 0]]), label=0), Mask(image=np.array([[0, 1, 1, 0, 1]]), label=3), diff --git a/tests/test_coco_format.py b/tests/test_coco_format.py index c2ee51bd8e..d57133d458 100644 --- a/tests/test_coco_format.py +++ b/tests/test_coco_format.py @@ -424,6 +424,16 @@ def test_can_save_and_load_images(self): self._test_save_and_load(expected_dataset, CocoImageInfoConverter.convert, test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', + attributes={'id': 1}), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(expected_dataset, + CocoImageInfoConverter.convert, test_dir) + def test_can_save_and_load_labels(self): expected_dataset = Dataset.from_iterable([ DatasetItem(id=1, subset='train', diff --git a/tests/test_cvat_format.py b/tests/test_cvat_format.py index 4caeaeed0d..c23c32a4b1 100644 --- a/tests/test_cvat_format.py +++ b/tests/test_cvat_format.py @@ -237,6 +237,40 @@ def test_can_save_and_load(self): partial(CvatConverter.convert, save_images=True), test_dir, target_dataset=target_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + label_categories = LabelCategories() + for i in range(10): + label_categories.add(str(i)) + label_categories.items[2].attributes.update(['a1', 'a2', 'empty']) + label_categories.attributes.update(['occluded']) + + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + subset='s1', image=np.zeros((5, 10, 3)), + annotations=[ + Label(1), + ] + ), + ], categories={ + AnnotationType.label: label_categories, + }) + + target_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + subset='s1', image=np.zeros((5, 10, 3)), + annotations=[ + Label(1), + ], attributes={'frame': 0} + ), + ], categories={ + AnnotationType.label: label_categories, + }) + + with TestDir() as test_dir: + self._test_save_and_load(source_dataset, + partial(CvatConverter.convert, save_images=True), test_dir, + target_dataset=target_dataset) + def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((4, 2, 3))), diff --git a/tests/test_datumaro_format.py b/tests/test_datumaro_format.py index fa063fcfda..582bad7fdd 100644 --- a/tests/test_datumaro_format.py +++ b/tests/test_datumaro_format.py @@ -102,6 +102,15 @@ def test_relative_paths(self): self._test_save_and_load(test_dataset, partial(DatumaroConverter.convert, save_images=True), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + test_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((4, 2, 3))), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(test_dataset, + partial(DatumaroConverter.convert, save_images=True), test_dir) + def test_inplace_save_writes_only_updated_data(self): with TestDir() as path: # generate initial dataset diff --git a/tests/test_icdar_format.py b/tests/test_icdar_format.py index 
5583446531..69a4c89109 100644 --- a/tests/test_icdar_format.py +++ b/tests/test_icdar_format.py @@ -184,3 +184,15 @@ def test_can_save_and_load_with_no_subsets(self): with TestDir() as test_dir: self._test_save_and_load(expected_dataset, IcdarTextLocalizationConverter.convert, test_dir) + + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + expected_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 1, 3, 5), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(expected_dataset, + IcdarTextLocalizationConverter.convert, test_dir) diff --git a/tests/test_image_dir_format.py b/tests/test_image_dir_format.py index f7f21b0888..8ff52eb4cc 100644 --- a/tests/test_image_dir_format.py +++ b/tests/test_image_dir_format.py @@ -28,4 +28,13 @@ def test_relative_paths(self): with TestDir() as test_dir: test_save_and_load(self, dataset, ImageDirConverter.convert, - test_dir, importer='image_dir') \ No newline at end of file + test_dir, importer='image_dir') + + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((4, 2, 3))), + ]) + + with TestDir() as test_dir: + test_save_and_load(self, dataset, ImageDirConverter.convert, + test_dir, importer='image_dir') diff --git a/tests/test_imagenet_format.py b/tests/test_imagenet_format.py index 2b4ef79fb1..2b093e2bc2 100644 --- a/tests/test_imagenet_format.py +++ b/tests/test_imagenet_format.py @@ -83,6 +83,24 @@ def test_can_save_and_load_with_multiple_labels(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((8, 8, 3)), + annotations=[Label(0), Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(2)), + }) + + with TestDir() as test_dir: + ImagenetConverter.convert(source_dataset, test_dir, save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'imagenet') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'imagenet_dataset') diff --git a/tests/test_imagenet_txt_format.py b/tests/test_imagenet_txt_format.py index 4f5dda37c5..a5ad3b778c 100644 --- a/tests/test_imagenet_txt_format.py +++ b/tests/test_imagenet_txt_format.py @@ -90,6 +90,25 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset, require_images=True) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id="кириллица с пробелом", + image=np.ones((8, 8, 3)), + annotations=[Label(0), Label(1)] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(2)), + }) + + with TestDir() as test_dir: + ImagenetTxtConverter.convert(source_dataset, test_dir, save_images=True) + + parsed_dataset = Dataset.import_from(test_dir, 'imagenet_txt') + + compare_datasets(self, source_dataset, parsed_dataset, + require_images=True) + DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'imagenet_txt_dataset') class ImagenetTxtImporterTest(TestCase): diff --git a/tests/test_labelme_format.py b/tests/test_labelme_format.py index 
244a590b07..fbeae19d57 100644 --- a/tests/test_labelme_format.py +++ b/tests/test_labelme_format.py @@ -87,6 +87,48 @@ def test_can_save_and_load(self): partial(LabelMeConverter.convert, save_images=True), test_dir, target_dataset=target_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', + image=np.ones((16, 16, 3)), + annotations=[ + Polygon([0, 4, 4, 4, 5, 6], label=3, attributes={ + 'occluded': True, + 'a1': 'qwe', + 'a2': True, + 'a3': 123, + }), + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(10)), + }) + + target_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', + image=np.ones((16, 16, 3)), + annotations=[ + Polygon([0, 4, 4, 4, 5, 6], label=0, id=0, + attributes={ + 'occluded': True, 'username': '', + 'a1': 'qwe', + 'a2': True, + 'a3': 123, + } + ), + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable([ + 'label_3']), + }) + + with TestDir() as test_dir: + self._test_save_and_load( + source_dataset, + partial(LabelMeConverter.convert, save_images=True), + test_dir, target_dataset=target_dataset) DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'labelme_dataset') diff --git a/tests/test_lfw_format.py b/tests/test_lfw_format.py index 541cccaa02..975ba84057 100644 --- a/tests/test_lfw_format.py +++ b/tests/test_lfw_format.py @@ -101,6 +101,30 @@ def test_can_save_and_load_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((2, 5, 3)), + attributes = { + 'positive_pairs': [], + 'negative_pairs': ['name0/name0_0002'] + }, + ), + DatasetItem(id='name0/name0_0002', + image=np.ones((2, 5, 3)), + attributes = { + 'positive_pairs': [], + 'negative_pairs': [] + }, + ), + ]) + + with TestDir() as test_dir: + LfwConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'lfw') + + compare_datasets(self, source_dataset, parsed_dataset) + DUMMY_DATASET_DIR = osp.join(osp.dirname(__file__), 'assets', 'lfw_dataset') class LfwImporterTest(TestCase): diff --git a/tests/test_market1501_format.py b/tests/test_market1501_format.py index d5422acc09..a53ef89d6f 100644 --- a/tests/test_market1501_format.py +++ b/tests/test_market1501_format.py @@ -62,6 +62,24 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((2, 5, 3)), + attributes = { + 'camera_id': 1, + 'person_id': 1, + 'query': True + } + ), + ]) + + with TestDir() as test_dir: + Market1501Converter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'market1501') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_no_save_images(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='0001_c2s3_000001_00', diff --git a/tests/test_mots_format.py b/tests/test_mots_format.py index f8358dda3c..dfe613f773 100644 --- a/tests/test_mots_format.py +++ b/tests/test_mots_format.py @@ -66,6 +66,19 @@ def 
test_can_save_masks(self): partial(MotsPngConverter.convert, save_images=True), test_dir, target_dataset=target) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='a', + image=np.ones((5, 1)), annotations=[ + Mask(np.array([[1, 0, 0, 0, 0]]), label=0, + attributes={'track_id': 2}), + ]), + ], categories=['a']) + + with TestDir() as test_dir: + self._test_save_and_load(source, + partial(MotsPngConverter.convert, save_images=True), test_dir) + class MotsImporterTest(TestCase): def test_can_detect(self): self.assertTrue(MotsImporter.detect(DUMMY_DATASET_DIR)) diff --git a/tests/test_tfrecord_format.py b/tests/test_tfrecord_format.py index 8b63c71a1b..96e70ee4a8 100644 --- a/tests/test_tfrecord_format.py +++ b/tests/test_tfrecord_format.py @@ -121,6 +121,27 @@ def test_can_save_dataset_with_no_subsets(self): partial(TfDetectionApiConverter.convert, save_images=True), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + test_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', + image=np.ones((16, 16, 3)), + annotations=[ + Bbox(2, 1, 4, 4, label=2), + Bbox(4, 2, 8, 4, label=3), + ], + attributes={'source_id': ''} + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(label) for label in range(10)), + }) + + with TestDir() as test_dir: + self._test_save_and_load( + test_dataset, + partial(TfDetectionApiConverter.convert, save_images=True), + test_dir) + def test_can_save_dataset_with_image_info(self): test_dataset = Dataset.from_iterable([ DatasetItem(id='1/q.e', diff --git a/tests/test_vgg_face2_format.py b/tests/test_vgg_face2_format.py index 38eb7aacdf..da95614d9a 100644 --- a/tests/test_vgg_face2_format.py +++ b/tests/test_vgg_face2_format.py @@ -71,6 +71,22 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), + annotations=[ + Points([4.23, 4.32, 5.34, 4.45, 3.54, + 3.56, 4.52, 3.51, 4.78, 3.34], label=0), + ] + ), + ], categories=['a']) + + with TestDir() as test_dir: + VggFace2Converter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'vgg_face2') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_no_save_images(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', image=np.ones((8, 8, 3)), diff --git a/tests/test_voc_format.py b/tests/test_voc_format.py index fddafd6470..f936eb6746 100644 --- a/tests/test_voc_format.py +++ b/tests/test_voc_format.py @@ -415,6 +415,20 @@ def __iter__(self): self._test_save_and_load(TestExtractor(), partial(VocConverter.convert, label_map='voc'), test_dir) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + class TestExtractor(TestExtractorBase): + def __iter__(self): + return iter([ + DatasetItem(id='кириллица с пробелом', annotations=[ + Label(2), + Label(3), + ]), + ]) + + with TestDir() as test_dir: + self._test_save_and_load(TestExtractor(), + partial(VocConverter.convert, label_map='voc'), test_dir) + def test_can_save_dataset_with_images(self): class TestExtractor(TestExtractorBase): def __iter__(self): diff --git a/tests/test_widerface_format.py b/tests/test_widerface_format.py index 
03f15e623c..46163554c9 100644 --- a/tests/test_widerface_format.py +++ b/tests/test_widerface_format.py @@ -83,6 +83,26 @@ def test_can_save_dataset_with_no_subsets(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 1, 2, 3, label=1, attributes = { + 'blur': '2', 'expression': '0', 'illumination': '0', + 'occluded': '0', 'pose': '2', 'invalid': '0'}), + ] + ), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(i) for i in range(3)), + }) + + with TestDir() as test_dir: + WiderFaceConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'wider_face') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_non_widerface_attributes(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='a/b/1', image=np.ones((8, 8, 3)), diff --git a/tests/test_yolo_format.py b/tests/test_yolo_format.py index 5c46eb27a0..f21420e11c 100644 --- a/tests/test_yolo_format.py +++ b/tests/test_yolo_format.py @@ -90,6 +90,24 @@ def test_can_load_dataset_with_exact_image_info(self): compare_datasets(self, source_dataset, parsed_dataset) + def test_can_save_dataset_with_cyrillic_and_spaces_in_filename(self): + source_dataset = Dataset.from_iterable([ + DatasetItem(id='кириллица с пробелом', subset='train', image=np.ones((8, 8, 3)), + annotations=[ + Bbox(0, 2, 4, 2, label=2), + Bbox(0, 1, 2, 3, label=4), + ]), + ], categories={ + AnnotationType.label: LabelCategories.from_iterable( + 'label_' + str(i) for i in range(10)), + }) + + with TestDir() as test_dir: + YoloConverter.convert(source_dataset, test_dir, save_images=True) + parsed_dataset = Dataset.import_from(test_dir, 'yolo') + + compare_datasets(self, source_dataset, parsed_dataset) + def test_relative_paths(self): source_dataset = Dataset.from_iterable([ DatasetItem(id='1', subset='train', From 0a09ba063e5328927500917f286fa00472290ac1 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Wed, 10 Mar 2021 11:47:52 +0300 Subject: [PATCH 03/16] some fixes --- datumaro/plugins/imagenet_format.py | 1 - datumaro/plugins/voc_format/extractor.py | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py index 829534d700..8dbe6cc6d6 100644 --- a/datumaro/plugins/imagenet_format.py +++ b/datumaro/plugins/imagenet_format.py @@ -1,4 +1,3 @@ -#_*_ coding:utf-8 _*_ # Copyright (C) 2020 Intel Corporation # # SPDX-License-Identifier: MIT diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index b84ac0fd23..4be5bec2c1 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -58,8 +58,15 @@ def _load_categories(dataset_path): @staticmethod def _load_subset_list(subset_path): + subset_list = [] with open(subset_path, encoding='utf-8') as f: - return [line.strip() for line in f] + for line in f: + objects = line.split('\"') + if 1 < len(objects): + subset_list.append(objects[1]) + else: + subset_list.append(line.split()[0]) + return subset_list class VocClassificationExtractor(_VocExtractor): def __iter__(self): From fac8f111bdf4a0d689aead614a27d2a72e8ad8a9 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Wed, 10 Mar 2021 12:27:56 +0300 Subject: 
[PATCH 04/16] fix voc format --- datumaro/plugins/voc_format/converter.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index 73c9925292..934fc1ffd1 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -369,6 +369,8 @@ def save_action_lists(self, subset_name, action_list): self._get_filtered_lines(ann_file, self._patch, subset_name, items) with open(ann_file, 'w', encoding='utf-8') as f: for item in items: + if 1 < len(item.split()): + item = '\"' + item + '\"' f.write('%s\n' % item) if not items and not self._patch: @@ -435,6 +437,8 @@ def save_clsdet_lists(self, subset_name, clsdet_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: + if 1 < len(item.split()): + item = '\"' + item + '\"' f.write('%s\n' % item) def save_segm_lists(self, subset_name, segm_list): @@ -447,10 +451,14 @@ def save_segm_lists(self, subset_name, segm_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: + if 1 < len(item.split()): + item = '\"' + item + '\"' f.write('%s\n' % item) def save_layout_lists(self, subset_name, layout_list): def _write_item(f, item, item_layouts): + if 1 < len(item.split()): + item = '\"' + item + '\"' if item_layouts: for obj_id in item_layouts: f.write('%s % d\n' % (item, 1 + obj_id)) From 6c8812841d657a2fc3d8f2fffc07cb3e9faea9d8 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 10:53:13 +0300 Subject: [PATCH 05/16] fix camvid --- datumaro/plugins/camvid_format.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 2a3c9aaf6c..3f5767bfa1 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -7,6 +7,7 @@ import os.path as osp from collections import OrderedDict from enum import Enum +import re import numpy as np from datumaro.components.converter import Converter @@ -57,6 +58,7 @@ class CamvidPath: LABELMAP_FILE = 'label_colors.txt' SEGM_DIR = "annot" IMAGE_EXT = '.png' + PATTERN = re.compile(r'(.+[.]\S+) (.+)?') def parse_label_map(path): @@ -155,16 +157,8 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: + objects = CamvidPath.PATTERN.search(line).groups() objects = line.split() - if 2 < len(objects): - if len(objects) % 2: - raise Exception("Line %s: image and gt file must have " - "the same name" % line) - else: - mid = int(len(objects) / 2) - objects[0] = ' '.join(objects[i] for i in range(mid)) - objects[1] = ' '.join(objects[i] for i in range(mid, 2 * mid)) - objects = objects[:2] image = objects[0] item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, From 135b4ae13f9076f7345b3539779bb9e63c0d707f Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 11:22:23 +0300 Subject: [PATCH 06/16] fix voc --- datumaro/plugins/camvid_format.py | 5 ++--- datumaro/plugins/voc_format/converter.py | 16 ++++++++-------- datumaro/plugins/voc_format/extractor.py | 9 ++++----- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 3f5767bfa1..4161b197dd 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -5,9 +5,9 @@ import os import os.path as osp +import re from collections import OrderedDict from enum 
import Enum -import re import numpy as np from datumaro.components.converter import Converter @@ -16,8 +16,7 @@ MaskCategories, SourceExtractor) from datumaro.util import find, str_to_bool from datumaro.util.image import save_image -from datumaro.util.mask_tools import lazy_mask, paint_mask, generate_colormap - +from datumaro.util.mask_tools import generate_colormap, lazy_mask, paint_mask CamvidLabelMap = OrderedDict([ ('Void', (0, 0, 0)), diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index 934fc1ffd1..c3aac6b18a 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -369,8 +369,8 @@ def save_action_lists(self, subset_name, action_list): self._get_filtered_lines(ann_file, self._patch, subset_name, items) with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' f.write('%s\n' % item) if not items and not self._patch: @@ -437,8 +437,8 @@ def save_clsdet_lists(self, subset_name, clsdet_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' f.write('%s\n' % item) def save_segm_lists(self, subset_name, segm_list): @@ -451,14 +451,14 @@ def save_segm_lists(self, subset_name, segm_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' f.write('%s\n' % item) def save_layout_lists(self, subset_name, layout_list): def _write_item(f, item, item_layouts): - if 1 < len(item.split()): - item = '\"' + item + '\"' + # if 1 < len(item.split()): + # item = '\"' + item + '\"' if item_layouts: for obj_id in item_layouts: f.write('%s % d\n' % (item, 1 + obj_id)) diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index 4be5bec2c1..23cc14806c 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -61,11 +61,10 @@ def _load_subset_list(subset_path): subset_list = [] with open(subset_path, encoding='utf-8') as f: for line in f: - objects = line.split('\"') - if 1 < len(objects): - subset_list.append(objects[1]) - else: - subset_list.append(line.split()[0]) + line = line.strip().split() + if 2 < len(line): + line[0] = ' '.join(line[i] for i in range(len(line))) + subset_list.append(line[0]) return subset_list class VocClassificationExtractor(_VocExtractor): From d48d98ed53a21d012b3ab24dea3d474ae61c2868 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 11:56:41 +0300 Subject: [PATCH 07/16] some fixes --- datumaro/plugins/camvid_format.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 4161b197dd..a4a5440466 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -156,14 +156,17 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - objects = CamvidPath.PATTERN.search(line).groups() - objects = line.split() + search = CamvidPath.PATTERN.search(line) + if search: + objects = CamvidPath.PATTERN.search(line).groups() + else: + raise Exception("Line %s: invalid path format" % line) image = objects[0] item_id = 
('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, (image, image[1:])[image[0] == '/']) item_annotations = [] - if 1 < len(objects): + if objects[1] != None: gt = objects[1] gt_path = osp.join(self._dataset_dir, (gt, gt[1:]) [gt[0] == '/']) From 1b42acc23d474671a56ef9e64f668f0478e803d3 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Thu, 11 Mar 2021 12:16:39 +0300 Subject: [PATCH 08/16] fix regex in camvid --- datumaro/plugins/camvid_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 9b467e6a18..d1e43a3ca6 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+[.]\S+) (.+)?') + PATTERN = re.compile(r'(.+[.]\S+) (.+[.]\S+)?') def parse_label_map(path): From 2d837d63940301fb193ef459da2baaebe73e2904 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Fri, 12 Mar 2021 09:41:14 +0300 Subject: [PATCH 09/16] add exception if unexpected number of quotes --- datumaro/plugins/imagenet_txt_format.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py index aceee4e5b3..751ce9d11c 100644 --- a/datumaro/plugins/imagenet_txt_format.py +++ b/datumaro/plugins/imagenet_txt_format.py @@ -50,9 +50,13 @@ def _load_items(self, path): with open(path, encoding='utf-8') as f: for line in f: item = line.split('\"') - if len(item) == 3: - item_id = item[1] - label_ids = [int(id) for id in item[2].split()] + if 1 < len(item): + if len(item) == 3: + item_id = item[1] + label_ids = [int(id) for id in item[2].split()] + else: + raise Exception("Line %s: unexpected number " + "of quotes in filename" % line) else: item = line.split() item_id = item[0] From 385ad9b38542a31ef4f159ca641b4e0d631a44d8 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Sat, 13 Mar 2021 19:35:12 +0300 Subject: [PATCH 10/16] fix voc format --- datumaro/plugins/voc_format/converter.py | 10 ++-------- datumaro/plugins/voc_format/extractor.py | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py index 9ddad7c716..bbf4f8a8de 100644 --- a/datumaro/plugins/voc_format/converter.py +++ b/datumaro/plugins/voc_format/converter.py @@ -370,8 +370,6 @@ def save_action_lists(self, subset_name, action_list): self._get_filtered_lines(ann_file, self._patch, subset_name, items) with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - # if 1 < len(item.split()): - # item = '\"' + item + '\"' f.write('%s\n' % item) if not items and not self._patch: @@ -438,8 +436,6 @@ def save_clsdet_lists(self, subset_name, clsdet_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - # if 1 < len(item.split()): - # item = '\"' + item + '\"' f.write('%s\n' % item) def save_segm_lists(self, subset_name, segm_list): @@ -452,14 +448,12 @@ def save_segm_lists(self, subset_name, segm_list): with open(ann_file, 'w', encoding='utf-8') as f: for item in items: - # if 1 < len(item.split()): - # item = '\"' + item + '\"' f.write('%s\n' % item) def save_layout_lists(self, subset_name, layout_list): def _write_item(f, item, item_layouts): - # if 1 < len(item.split()): - # item = '\"' + item + '\"' + if 1 < 
len(item.split()): + item = '\"' + item + '\"' if item_layouts: for obj_id in item_layouts: f.write('%s % d\n' % (item, 1 + obj_id)) diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index 23cc14806c..e883a26703 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -61,10 +61,20 @@ def _load_subset_list(subset_path): subset_list = [] with open(subset_path, encoding='utf-8') as f: for line in f: - line = line.strip().split() - if 2 < len(line): - line[0] = ' '.join(line[i] for i in range(len(line))) - subset_list.append(line[0]) + dirname = osp.basename(osp.dirname(subset_path)) + if dirname == VocPath.TASK_DIR[VocTask.person_layout]: + objects = line.split('\"') + if 1 < len(objects): + if len(objects) == 3: + line = objects[1] + else: + raise Exception("Line %s: unexpected number " + "of quotes in filename" % line) + else: + line = line.split()[0] + else: + line = line.strip() + subset_list.append(line) return subset_list class VocClassificationExtractor(_VocExtractor): @@ -88,9 +98,7 @@ def _load_annotations(self): label = ann_filename[:ann_filename.rfind('_')] label_id = self._get_label_id(label) for line in f: - objects = line.split() - item = ' '.join(objects[i] for i in range(len(objects) - 1)) - present = objects[-1] + item, present = line.rsplit(maxsplit=1) if present == '1': annotations[item].append(label_id) From 1c5638abcc936327870e5dd5afb1bc06bf1095e5 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 15 Mar 2021 16:06:28 +0300 Subject: [PATCH 11/16] some fixes --- datumaro/plugins/camvid_format.py | 6 +++--- datumaro/plugins/voc_format/extractor.py | 18 +++++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index d1e43a3ca6..40d69334c6 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+[.]\S+) (.+[.]\S+)?') + PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)?\s*') def parse_label_map(path): @@ -160,9 +160,9 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line) + search = CamvidPath.PATTERN.search(line.strip()) if search: - objects = CamvidPath.PATTERN.search(line).groups() + objects = search.groups() else: raise Exception("Line %s: invalid path format" % line) image = objects[0] diff --git a/datumaro/plugins/voc_format/extractor.py b/datumaro/plugins/voc_format/extractor.py index e883a26703..dcee3523be 100644 --- a/datumaro/plugins/voc_format/extractor.py +++ b/datumaro/plugins/voc_format/extractor.py @@ -24,10 +24,11 @@ _inverse_inst_colormap = invert_colormap(VocInstColormap) class _VocExtractor(SourceExtractor): - def __init__(self, path): + def __init__(self, path, task): assert osp.isfile(path), path self._path = path self._dataset_dir = osp.dirname(osp.dirname(osp.dirname(path))) + self._task = task super().__init__(subset=osp.splitext(osp.basename(path))[0]) @@ -56,13 +57,11 @@ def _load_categories(dataset_path): label_map = parse_label_map(label_map_path) return make_voc_categories(label_map) - @staticmethod - def _load_subset_list(subset_path): + def _load_subset_list(self, subset_path): subset_list = [] with open(subset_path, encoding='utf-8') as f: for line in f: - dirname = osp.basename(osp.dirname(subset_path)) - if dirname 
== VocPath.TASK_DIR[VocTask.person_layout]: + if self._task == VocTask.person_layout: objects = line.split('\"') if 1 < len(objects): if len(objects) == 3: @@ -78,6 +77,9 @@ def _load_subset_list(subset_path): return subset_list class VocClassificationExtractor(_VocExtractor): + def __init__(self, path): + super().__init__(path, VocTask.classification) + def __iter__(self): raw_anns = self._load_annotations() for item_id in self._items: @@ -110,8 +112,7 @@ def _parse_annotations(raw_anns, item_id): class _VocXmlExtractor(_VocExtractor): def __init__(self, path, task): - super().__init__(path) - self._task = task + super().__init__(path, task) def __iter__(self): anno_dir = osp.join(self._dataset_dir, VocPath.ANNOTATIONS_DIR) @@ -246,6 +247,9 @@ def __init__(self, path): super().__init__(path, task=VocTask.action_classification) class VocSegmentationExtractor(_VocExtractor): + def __init__(self, path): + super().__init__(path, task=VocTask.segmentation) + def __iter__(self): for item_id in self._items: log.debug("Reading item '%s'" % item_id) From be16492346932125c447306dd12ad29bcd2d4f0c Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Mon, 15 Mar 2021 18:36:34 +0300 Subject: [PATCH 12/16] fix camvid extractor --- datumaro/plugins/camvid_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 40d69334c6..28b7631ec2 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)?\s*') + PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)\s*') def parse_label_map(path): @@ -160,7 +160,7 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line.strip()) + search = CamvidPath.PATTERN.search(line.strip('\n')) if search: objects = search.groups() else: From dd6eecfe2dead6041f6ccef115ab49dd512bd185 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 11:56:34 +0300 Subject: [PATCH 13/16] update Changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e475ce93e1..dd8fb88a98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ### Fixed -- +- The ability to work with file names containing Cyrillic and spaces () ### Security - From 68e41dc400a946c270268631c73fa41b709da5e7 Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 16:09:00 +0300 Subject: [PATCH 14/16] fix regex --- datumaro/plugins/camvid_format.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 28b7631ec2..9df865e686 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+\.\S+)(?:\s+(.+\.\S+)?)\s*') + PATTERN = re.compile(r'(.+\.\S+)?(?:\s*(.+\.\S+))') def parse_label_map(path): @@ -160,9 +160,11 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line.strip('\n')) + search = CamvidPath.PATTERN.search(line.strip()) if search: objects = search.groups() + if not 
objects[0]: + objects = [objects[1], objects[0]] else: raise Exception("Line %s: invalid path format" % line) image = objects[0] From 8b3f93c7ebb43efcf6ac2f510f171176867cd2fb Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 16:46:01 +0300 Subject: [PATCH 15/16] fix regex --- datumaro/plugins/camvid_format.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index 9df865e686..e59cf756b1 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -61,7 +61,7 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+\.\S+)?(?:\s*(.+\.\S+))') + PATTERN = re.compile(r'(.+?\.\S+)(?:\s+(.+\.\S+)?)?') def parse_label_map(path): @@ -163,8 +163,6 @@ def _load_items(self, path): search = CamvidPath.PATTERN.search(line.strip()) if search: objects = search.groups() - if not objects[0]: - objects = [objects[1], objects[0]] else: raise Exception("Line %s: invalid path format" % line) image = objects[0] From 37da6d10bc9a8d921d6c7ff4a4f0095cf114671a Mon Sep 17 00:00:00 2001 From: yasakova-anastasia Date: Tue, 16 Mar 2021 17:44:14 +0300 Subject: [PATCH 16/16] fix camvid extractor --- datumaro/plugins/camvid_format.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/datumaro/plugins/camvid_format.py b/datumaro/plugins/camvid_format.py index e59cf756b1..8577a423ba 100644 --- a/datumaro/plugins/camvid_format.py +++ b/datumaro/plugins/camvid_format.py @@ -6,7 +6,6 @@ import logging as log import os import os.path as osp -import re from collections import OrderedDict from enum import Enum @@ -61,7 +60,6 @@ class CamvidPath: SEGM_DIR = "annot" IMAGE_EXT = '.jpg' MASK_EXT = '.png' - PATTERN = re.compile(r'(.+?\.\S+)(?:\s+(.+\.\S+)?)?') def parse_label_map(path): @@ -160,17 +158,23 @@ def _load_items(self, path): items = {} with open(path, encoding='utf-8') as f: for line in f: - search = CamvidPath.PATTERN.search(line.strip()) - if search: - objects = search.groups() + line = line.strip() + objects = line.split('\"') + if 1 < len(objects): + if len(objects) == 5: + objects[0] = objects[1] + objects[1] = objects[3] + else: + raise Exception("Line %s: unexpected number " + "of quotes in filename" % line) else: - raise Exception("Line %s: invalid path format" % line) + objects = line.split() image = objects[0] item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)] image_path = osp.join(self._dataset_dir, (image, image[1:])[image[0] == '/']) item_annotations = [] - if objects[1] != None: + if 1 < len(objects): gt = objects[1] gt_path = osp.join(self._dataset_dir, (gt, gt[1:]) [gt[0] == '/']) @@ -271,10 +275,12 @@ def save_segm_lists(self, subset_name, segm_list): ann_file = osp.join(self._save_dir, subset_name + '.txt') with open(ann_file, 'w', encoding='utf-8') as f: for (image_path, mask_path) in segm_list.values(): - f.write('/%s %s\n' % ( - image_path.replace('\\', '/'), - mask_path.replace('\\', '/')) - ) + image_path = '/' + image_path.replace('\\', '/') + mask_path = mask_path.replace('\\', '/') + if 1 < len(image_path.split()) or 1 < len(mask_path.split()): + image_path = '\"' + image_path + '\"' + mask_path = '\"' + mask_path + '\"' + f.write('%s %s\n' % (image_path, mask_path)) def save_label_map(self): path = osp.join(self._save_dir, CamvidPath.LABELMAP_FILE)
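
For reference — a minimal sketch (not part of the patch series) of the quoting convention these changes converge on for the plain-text list files: an item id containing whitespace is wrapped in double quotes on write and recovered by splitting on the quote character on read, while unquoted ids keep the old whitespace-split behaviour. The helper names below are illustrative only.

    # Illustrative sketch only, mirroring the converter/extractor logic above.
    # Function names are hypothetical and not part of the patched code.

    def write_list_entry(item_id):
        # Converters wrap an id in double quotes only when it contains whitespace.
        if 1 < len(item_id.split()):
            item_id = '"' + item_id + '"'
        return '%s\n' % item_id

    def read_list_entry(line):
        # Extractors split on the quote character: a quoted id yields exactly
        # three parts ['', '<id>', '<rest>']; anything else falls back to the
        # old whitespace split.
        parts = line.split('"')
        if 1 < len(parts):
            if len(parts) == 3:
                return parts[1]
            raise Exception("Line %s: unexpected number "
                "of quotes in filename" % line)
        return line.split()[0]

    assert read_list_entry(write_list_entry('кириллица с пробелом')) == 'кириллица с пробелом'
    assert read_list_entry(write_list_entry('name0/name0_0002')) == 'name0/name0_0002'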