Skip to content

Commit

Permalink
Fix dataset formats (Cyrillic and spaces in file names) (#148)
Browse files Browse the repository at this point in the history
* Support non-ASCII characters (e.g. Cyrillic) and spaces in file names across dataset formats

* update Changelog
  • Loading branch information
yasakova-anastasia authored Mar 17, 2021
1 parent a698cac commit ef12d30
Show file tree
Hide file tree
Showing 28 changed files with 442 additions and 59 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-

### Fixed
-
- The ability to work with file names containing Cyrillic and spaces (<https://github.com/openvinotoolkit/datumaro/pull/148>)

### Security
-
Expand Down
27 changes: 19 additions & 8 deletions datumaro/plugins/camvid_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
from datumaro.util import find, str_to_bool
from datumaro.util.annotation_util import make_label_id_mapping
from datumaro.util.image import save_image
from datumaro.util.mask_tools import lazy_mask, paint_mask, generate_colormap

from datumaro.util.mask_tools import generate_colormap, lazy_mask, paint_mask

CamvidLabelMap = OrderedDict([
('Void', (0, 0, 0)),
Expand Down Expand Up @@ -159,7 +158,17 @@ def _load_items(self, path):
items = {}
with open(path, encoding='utf-8') as f:
for line in f:
objects = line.split()
line = line.strip()
objects = line.split('\"')
if 1 < len(objects):
if len(objects) == 5:
objects[0] = objects[1]
objects[1] = objects[3]
else:
raise Exception("Line %s: unexpected number "
"of quotes in filename" % line)
else:
objects = line.split()
image = objects[0]
item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)]
image_path = osp.join(self._dataset_dir,
Expand Down Expand Up @@ -264,12 +273,14 @@ def save_segm_lists(self, subset_name, segm_list):
return

ann_file = osp.join(self._save_dir, subset_name + '.txt')
with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for (image_path, mask_path) in segm_list.values():
f.write('/%s %s\n' % (
image_path.replace('\\', '/'),
mask_path.replace('\\', '/'))
)
image_path = '/' + image_path.replace('\\', '/')
mask_path = mask_path.replace('\\', '/')
if 1 < len(image_path.split()) or 1 < len(mask_path.split()):
image_path = '\"' + image_path + '\"'
mask_path = '\"' + mask_path + '\"'
f.write('%s %s\n' % (image_path, mask_path))

def save_label_map(self):
path = osp.join(self._save_dir, CamvidPath.LABELMAP_FILE)
Expand Down
1 change: 0 additions & 1 deletion datumaro/plugins/imagenet_format.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# Copyright (C) 2020 Intel Corporation
#
# SPDX-License-Identifier: MIT
Expand Down
24 changes: 19 additions & 5 deletions datumaro/plugins/imagenet_txt_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,18 @@ def _load_items(self, path):
items = {}
with open(path, encoding='utf-8') as f:
for line in f:
item = line.split()
item_id = item[0]
label_ids = [int(id) for id in item[1:]]
item = line.split('\"')
if 1 < len(item):
if len(item) == 3:
item_id = item[1]
label_ids = [int(id) for id in item[2].split()]
else:
raise Exception("Line %s: unexpected number "
"of quotes in filename" % line)
else:
item = line.split()
item_id = item[0]
label_ids = [int(id) for id in item[1:]]
anno = []
for label in label_ids:
assert 0 <= label and \
Expand Down Expand Up @@ -95,9 +104,14 @@ def apply(self):
if self._save_images and item.has_image:
self._save_image(item, subdir=ImagenetTxtPath.IMAGE_DIR)

annotation = ''
for item_id, item_labels in labels.items():
if 1 < len(item_id.split()):
item_id = '\"' + item_id + '\"'
annotation += '%s %s\n' % (item_id, ' '.join(item_labels))

with open(annotation_file, 'w', encoding='utf-8') as f:
f.writelines(['%s %s\n' % (item_id, ' '.join(labels[item_id]))
for item_id in labels])
f.write(annotation)

labels_file = osp.join(subset_dir, ImagenetTxtPath.LABELS_FILE)
with open(labels_file, 'w', encoding='utf-8') as f:
Expand Down
46 changes: 35 additions & 11 deletions datumaro/plugins/lfw_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,14 @@ def _load_items(self, path):
images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR)
with open(path, encoding='utf-8') as f:
for line in f:
pair = line.strip().split()
pair = line.strip().split('\t')
if len(pair) == 3:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if pair[0] == '-':
image1 = pair[1]
image2 = pair[2]
else:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT),
Expand All @@ -47,8 +51,14 @@ def _load_items(self, path):
attributes = items[image1].attributes
attributes['positive_pairs'].append(image2)
elif len(pair) == 4:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[2], pair[3])
if pair[0] == '-':
image1 = pair[1]
else:
image1 = self.get_image_name(pair[0], pair[1])
if pair[2] == '-':
image2 = pair[3]
else:
image2 = self.get_image_name(pair[2], pair[3])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT),
Expand Down Expand Up @@ -102,17 +112,31 @@ def apply(self):
self._save_image(item, osp.join(self._save_dir, subset_name,
LfwPath.IMAGES_DIR, item.id + LfwPath.IMAGE_EXT))

person1, num1 = LfwPath.PATTERN.search(item.id).groups()
num1 = int(num1)
search = LfwPath.PATTERN.search(item.id)
if search:
person1, num1 = search.groups()
num1 = int(num1)
else:
person1 = '-'
num1 = item.id
if 'positive_pairs' in item.attributes:
for pair in item.attributes['positive_pairs']:
num2 = LfwPath.PATTERN.search(pair).groups()[1]
num2 = int(num2)
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))
if 'negative_pairs' in item.attributes:
for pair in item.attributes['negative_pairs']:
person2, num2 = LfwPath.PATTERN.search(pair).groups()
num2 = int(num2)
search = LfwPath.PATTERN.search(pair)
if search:
person2, num2 = search.groups()
num2 = int(num2)
else:
person2 = '-'
num2 = pair
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))

Expand Down
8 changes: 4 additions & 4 deletions datumaro/plugins/vgg_face2_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _split_item_path(path):

items = {}

with open(path) as content:
with open(path, encoding='utf-8') as content:
landmarks_table = list(csv.DictReader(content))
for row in landmarks_table:
item_id = row['NAME_ID']
Expand Down Expand Up @@ -96,7 +96,7 @@ def _split_item_path(path):
bboxes_path = osp.join(self._dataset_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.BBOXES_FILE + self._subset + '.csv')
if osp.isfile(bboxes_path):
with open(bboxes_path) as content:
with open(bboxes_path, encoding='utf-8') as content:
bboxes_table = list(csv.DictReader(content))
for row in bboxes_table:
item_id = row['NAME_ID']
Expand Down Expand Up @@ -221,7 +221,7 @@ def apply(self):
landmarks_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.LANDMARKS_FILE + subset_name + '.csv')
os.makedirs(osp.dirname(landmarks_path), exist_ok=True)
with open(landmarks_path, 'w', newline='') as file:
with open(landmarks_path, 'w', encoding='utf-8', newline='') as file:
columns = ['NAME_ID', 'P1X', 'P1Y', 'P2X', 'P2Y',
'P3X', 'P3Y', 'P4X', 'P4Y', 'P5X', 'P5Y']
writer = csv.DictWriter(file, fieldnames=columns)
Expand All @@ -232,7 +232,7 @@ def apply(self):
bboxes_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.BBOXES_FILE + subset_name + '.csv')
os.makedirs(osp.dirname(bboxes_path), exist_ok=True)
with open(bboxes_path, 'w', newline='') as file:
with open(bboxes_path, 'w', encoding='utf-8', newline='') as file:
columns = ['NAME_ID', 'X', 'Y', 'W', 'H']
writer = csv.DictWriter(file, fieldnames=columns)
writer.writeheader()
Expand Down
18 changes: 10 additions & 8 deletions datumaro/plugins/voc_format/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def save_subsets(self):
VocTask.action_classification}:
ann_path = osp.join(self._ann_dir, item.id + '.xml')
os.makedirs(osp.dirname(ann_path), exist_ok=True)
with open(ann_path, 'w') as f:
with open(ann_path, 'w', encoding='utf-8') as f:
f.write(ET.tostring(root_elem,
encoding='unicode', pretty_print=True))

Expand Down Expand Up @@ -351,7 +351,7 @@ def save_subsets(self):
@staticmethod
def _get_filtered_lines(path, patch, subset, items=None):
lines = {}
with open(path) as f:
with open(path, encoding='utf-8') as f:
for line in f:
item, text, _ = line.split(maxsplit=1) + ['', '']
if not patch or patch.updated_items.get((item, subset)) != \
Expand All @@ -368,7 +368,7 @@ def save_action_lists(self, subset_name, action_list):
items = {k: True for k in action_list}
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)
with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
f.write('%s\n' % item)

Expand All @@ -393,7 +393,7 @@ def _write_item(f, item, objs, action):
if self._patch and osp.isfile(ann_file):
lines = self._get_filtered_lines(ann_file, None, subset_name)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in action_list:
_write_item(f, item, action_list[item], action)
Expand All @@ -419,7 +419,7 @@ def _write_item(f, item, item_labels):
lines = self._get_filtered_lines(ann_file, self._patch,
subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in class_lists:
_write_item(f, item, class_lists[item])
Expand All @@ -434,7 +434,7 @@ def save_clsdet_lists(self, subset_name, clsdet_list):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
f.write('%s\n' % item)

Expand All @@ -446,12 +446,14 @@ def save_segm_lists(self, subset_name, segm_list):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
f.write('%s\n' % item)

def save_layout_lists(self, subset_name, layout_list):
def _write_item(f, item, item_layouts):
if 1 < len(item.split()):
item = '\"' + item + '\"'
if item_layouts:
for obj_id in item_layouts:
f.write('%s % d\n' % (item, 1 + obj_id))
Expand All @@ -466,7 +468,7 @@ def _write_item(f, item, item_layouts):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in layout_list:
_write_item(f, item, layout_list[item])
Expand Down
38 changes: 29 additions & 9 deletions datumaro/plugins/voc_format/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
_inverse_inst_colormap = invert_colormap(VocInstColormap)

class _VocExtractor(SourceExtractor):
def __init__(self, path):
def __init__(self, path, task):
assert osp.isfile(path), path
self._path = path
self._dataset_dir = osp.dirname(osp.dirname(osp.dirname(path)))
self._task = task

super().__init__(subset=osp.splitext(osp.basename(path))[0])

Expand Down Expand Up @@ -56,12 +57,29 @@ def _load_categories(dataset_path):
label_map = parse_label_map(label_map_path)
return make_voc_categories(label_map)

@staticmethod
def _load_subset_list(subset_path):
with open(subset_path) as f:
return [line.split()[0] for line in f]
def _load_subset_list(self, subset_path):
    """Read a VOC subset list file and return the list of item ids in it.

    Each non-empty line names one dataset item. For the person_layout
    task, an item id containing spaces is wrapped in double quotes and
    followed by extra columns; an unquoted layout line uses only its
    first whitespace-separated token. For every other task the whole
    stripped line is the item id (so ids may contain spaces).

    Raises:
        Exception: if a person_layout line contains a number of quote
            characters other than two.
    """
    subset_list = []
    with open(subset_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at EOF);
                # otherwise they would yield a bogus empty item id, or
                # an IndexError in the unquoted person_layout branch.
                continue
            if self._task == VocTask.person_layout:
                objects = line.split('\"')
                if 1 < len(objects):
                    # Quoted form: "<id possibly with spaces>" <columns>
                    # splitting on '"' gives ['', id, trailing columns].
                    if len(objects) == 3:
                        line = objects[1]
                    else:
                        raise Exception("Line %s: unexpected number "
                            "of quotes in filename" % line)
                else:
                    # Unquoted layout line: id is the first token.
                    line = line.split()[0]
            subset_list.append(line)
    return subset_list

class VocClassificationExtractor(_VocExtractor):
def __init__(self, path):
    """Create a classification extractor for the subset list at *path*.

    Delegates to the shared _VocExtractor base, presetting the task to
    VocTask.classification so task-dependent parsing picks the right
    branch.
    """
    super().__init__(path, VocTask.classification)

def __iter__(self):
raw_anns = self._load_annotations()
for item_id in self._items:
Expand All @@ -78,11 +96,11 @@ def _load_annotations(self):
anno_files = [s for s in dir_items(task_dir, '.txt')
if s.endswith('_' + osp.basename(self._path))]
for ann_filename in anno_files:
with open(osp.join(task_dir, ann_filename)) as f:
with open(osp.join(task_dir, ann_filename), encoding='utf-8') as f:
label = ann_filename[:ann_filename.rfind('_')]
label_id = self._get_label_id(label)
for line in f:
item, present = line.split()
item, present = line.rsplit(maxsplit=1)
if present == '1':
annotations[item].append(label_id)

Expand All @@ -94,8 +112,7 @@ def _parse_annotations(raw_anns, item_id):

class _VocXmlExtractor(_VocExtractor):
def __init__(self, path, task):
super().__init__(path)
self._task = task
super().__init__(path, task)

def __iter__(self):
anno_dir = osp.join(self._dataset_dir, VocPath.ANNOTATIONS_DIR)
Expand Down Expand Up @@ -230,6 +247,9 @@ def __init__(self, path):
super().__init__(path, task=VocTask.action_classification)

class VocSegmentationExtractor(_VocExtractor):
def __init__(self, path):
    """Create a segmentation extractor for the subset list at *path*.

    Delegates to the shared _VocExtractor base, presetting the task to
    VocTask.segmentation so task-dependent parsing picks the right
    branch.
    """
    super().__init__(path, task=VocTask.segmentation)

def __iter__(self):
for item_id in self._items:
log.debug("Reading item '%s'" % item_id)
Expand Down
4 changes: 2 additions & 2 deletions datumaro/plugins/widerface_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _load_categories(self):
def _load_items(self, path):
items = {}

with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
lines = f.readlines()

image_ids = [image_id for image_id, line in enumerate(lines)
Expand Down Expand Up @@ -178,5 +178,5 @@ def apply(self):
annotation_path = osp.join(save_dir, WiderFacePath.ANNOTATIONS_DIR,
'wider_face_' + subset_name + '_bbx_gt.txt')
os.makedirs(osp.dirname(annotation_path), exist_ok=True)
with open(annotation_path, 'w') as f:
with open(annotation_path, 'w', encoding='utf-8') as f:
f.write(wider_annotation)
Loading

0 comments on commit ef12d30

Please sign in to comment.