Fix dataset formats (Cyrillic and spaces in filename) #148

Merged
merged 17 commits on Mar 17, 2021
15 changes: 10 additions & 5 deletions datumaro/plugins/camvid_format.py
@@ -6,6 +6,7 @@
import logging as log
import os
import os.path as osp
import re
from collections import OrderedDict
from enum import Enum

@@ -18,8 +19,7 @@
from datumaro.util import find, str_to_bool
from datumaro.util.annotation_util import make_label_id_mapping
from datumaro.util.image import save_image
from datumaro.util.mask_tools import lazy_mask, paint_mask, generate_colormap

from datumaro.util.mask_tools import generate_colormap, lazy_mask, paint_mask

CamvidLabelMap = OrderedDict([
('Void', (0, 0, 0)),
@@ -61,6 +61,7 @@ class CamvidPath:
SEGM_DIR = "annot"
IMAGE_EXT = '.jpg'
MASK_EXT = '.png'
PATTERN = re.compile(r'(.+[.]\S+) (.+[.]\S+)?')


def parse_label_map(path):
@@ -159,13 +160,17 @@ def _load_items(self, path):
items = {}
with open(path, encoding='utf-8') as f:
for line in f:
objects = line.split()
search = CamvidPath.PATTERN.search(line)
if search:
objects = CamvidPath.PATTERN.search(line).groups()
else:
raise Exception("Line %s: invalid path format" % line)
image = objects[0]
item_id = ('/'.join(image.split('/')[2:]))[:-len(CamvidPath.IMAGE_EXT)]
image_path = osp.join(self._dataset_dir,
(image, image[1:])[image[0] == '/'])
item_annotations = []
if 1 < len(objects):
if objects[1] is not None:
gt = objects[1]
gt_path = osp.join(self._dataset_dir,
(gt, gt[1:]) [gt[0] == '/'])
@@ -264,7 +269,7 @@ def save_segm_lists(self, subset_name, segm_list):
return

ann_file = osp.join(self._save_dir, subset_name + '.txt')
with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for (image_path, mask_path) in segm_list.values():
f.write('/%s %s\n' % (
image_path.replace('\\', '/'),
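
For reference, a minimal sketch of how the new CamvidPath.PATTERN regex splits an annotation line when the paths contain spaces or Cyrillic characters; the file names below are invented for illustration:

```python
import re

# Same pattern as CamvidPath.PATTERN in the hunk above.
PATTERN = re.compile(r'(.+[.]\S+) (.+[.]\S+)?')

# Hypothetical line from <subset>.txt with spaces and Cyrillic in both paths.
line = '/train/кадр 0001.jpg /trainannot/кадр 0001.png\n'

search = PATTERN.search(line)
if search:
    image, gt = search.groups()
    # image -> '/train/кадр 0001.jpg'
    # gt    -> '/trainannot/кадр 0001.png'
else:
    raise Exception("Line %s: invalid path format" % line)
```

The previous line.split() would have produced four tokens for this line and broken both paths apart; the regex keeps each path intact by splitting only at the space after the first extension.
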
1 change: 0 additions & 1 deletion datumaro/plugins/imagenet_format.py
@@ -1,4 +1,3 @@

# Copyright (C) 2020 Intel Corporation
#
# SPDX-License-Identifier: MIT
24 changes: 19 additions & 5 deletions datumaro/plugins/imagenet_txt_format.py
@@ -49,9 +49,18 @@ def _load_items(self, path):
items = {}
with open(path, encoding='utf-8') as f:
for line in f:
item = line.split()
item_id = item[0]
label_ids = [int(id) for id in item[1:]]
item = line.split('\"')
if 1 < len(item):
if len(item) == 3:
item_id = item[1]
label_ids = [int(id) for id in item[2].split()]
else:
raise Exception("Line %s: unexpected number "
"of quotes in filename" % line)
else:
item = line.split()
item_id = item[0]
label_ids = [int(id) for id in item[1:]]
anno = []
for label in label_ids:
assert 0 <= label and \
@@ -95,9 +104,14 @@ def apply(self):
if self._save_images and item.has_image:
self._save_image(item, subdir=ImagenetTxtPath.IMAGE_DIR)

annotation = ''
for item_id, item_labels in labels.items():
if 1 < len(item_id.split()):
item_id = '\"' + item_id + '\"'
annotation += '%s %s\n' % (item_id, ' '.join(item_labels))

with open(annotation_file, 'w', encoding='utf-8') as f:
f.writelines(['%s %s\n' % (item_id, ' '.join(labels[item_id]))
for item_id in labels])
f.write(annotation)

labels_file = osp.join(subset_dir, ImagenetTxtPath.LABELS_FILE)
with open(labels_file, 'w', encoding='utf-8') as f:
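
To make the quoting scheme concrete, here is a hedged sketch of a writer/reader round trip for an item id containing spaces; the id and labels are made up:

```python
# Writer side: ids with spaces are wrapped in double quotes.
item_id = 'my фото 1'
item_labels = ['0', '5']
if 1 < len(item_id.split()):
    item_id = '"' + item_id + '"'
line = '%s %s\n' % (item_id, ' '.join(item_labels))
# line == '"my фото 1" 0 5\n'

# Reader side: splitting on the quote character yields exactly three parts
# for a well-formed line; ids without quotes fall back to a plain split().
parts = line.split('"')                            # ['', 'my фото 1', ' 0 5\n']
assert len(parts) == 3
item_id = parts[1]                                 # 'my фото 1'
label_ids = [int(i) for i in parts[2].split()]     # [0, 5]
```
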
46 changes: 35 additions & 11 deletions datumaro/plugins/lfw_format.py
@@ -31,10 +31,14 @@ def _load_items(self, path):
images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR)
with open(path, encoding='utf-8') as f:
for line in f:
pair = line.strip().split()
pair = line.strip().split('\t')
if len(pair) == 3:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if pair[0] == '-':
image1 = pair[1]
image2 = pair[2]
else:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT),
@@ -47,8 +51,14 @@ def _load_items(self, path):
attributes = items[image1].attributes
attributes['positive_pairs'].append(image2)
elif len(pair) == 4:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[2], pair[3])
if pair[0] == '-':
image1 = pair[1]
else:
image1 = self.get_image_name(pair[0], pair[1])
if pair[2] == '-':
image2 = pair[3]
else:
image2 = self.get_image_name(pair[2], pair[3])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=osp.join(images_dir, image1 + LfwPath.IMAGE_EXT),
@@ -102,17 +112,31 @@ def apply(self):
self._save_image(item, osp.join(self._save_dir, subset_name,
LfwPath.IMAGES_DIR, item.id + LfwPath.IMAGE_EXT))

person1, num1 = LfwPath.PATTERN.search(item.id).groups()
num1 = int(num1)
search = LfwPath.PATTERN.search(item.id)
if search:
person1, num1 = search.groups()
num1 = int(num1)
else:
person1 = '-'
num1 = item.id
if 'positive_pairs' in item.attributes:
for pair in item.attributes['positive_pairs']:
num2 = LfwPath.PATTERN.search(pair).groups()[1]
num2 = int(num2)
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))
if 'negative_pairs' in item.attributes:
for pair in item.attributes['negative_pairs']:
person2, num2 = LfwPath.PATTERN.search(pair).groups()
num2 = int(num2)
search = LfwPath.PATTERN.search(pair)
if search:
person2, num2 = search.groups()
num2 = int(num2)
else:
person2 = '-'
num2 = pair
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))

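
A small sketch of the pair lines this produces: ids that match LfwPath.PATTERN keep the usual person/number form, while free-form ids (spaces, Cyrillic, no trailing number) are written with '-' as the person and the raw id in the number column. The ids and the name_0001-style layout below are assumptions for illustration:

```python
lines = [
    'Aaron_Peirsol\t1\t2',        # standard LFW naming, positive pair
    '-\tкадр 1\tкадр 2',          # free-form ids, positive pair
]

for line in lines:
    pair = line.strip().split('\t')
    if len(pair) == 3:
        if pair[0] == '-':
            image1, image2 = pair[1], pair[2]
        else:
            # Stand-in for get_image_name(); assumes the usual
            # <person>/<person>_<number> LFW image layout.
            image1 = '%s/%s_%04d' % (pair[0], pair[0], int(pair[1]))
            image2 = '%s/%s_%04d' % (pair[0], pair[0], int(pair[2]))
```

Splitting on '\t' rather than arbitrary whitespace is what lets the free-form ids carry spaces.
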
8 changes: 4 additions & 4 deletions datumaro/plugins/vgg_face2_format.py
@@ -68,7 +68,7 @@ def _split_item_path(path):

items = {}

with open(path) as content:
with open(path, encoding='utf-8') as content:
landmarks_table = list(csv.DictReader(content))
for row in landmarks_table:
item_id = row['NAME_ID']
@@ -96,7 +96,7 @@ def _split_item_path(path):
bboxes_path = osp.join(self._dataset_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.BBOXES_FILE + self._subset + '.csv')
if osp.isfile(bboxes_path):
with open(bboxes_path) as content:
with open(bboxes_path, encoding='utf-8') as content:
bboxes_table = list(csv.DictReader(content))
for row in bboxes_table:
item_id = row['NAME_ID']
@@ -221,7 +221,7 @@ def apply(self):
landmarks_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.LANDMARKS_FILE + subset_name + '.csv')
os.makedirs(osp.dirname(landmarks_path), exist_ok=True)
with open(landmarks_path, 'w', newline='') as file:
with open(landmarks_path, 'w', encoding='utf-8', newline='') as file:
columns = ['NAME_ID', 'P1X', 'P1Y', 'P2X', 'P2Y',
'P3X', 'P3Y', 'P4X', 'P4Y', 'P5X', 'P5Y']
writer = csv.DictWriter(file, fieldnames=columns)
@@ -232,7 +232,7 @@
bboxes_path = osp.join(save_dir, VggFace2Path.ANNOTATION_DIR,
VggFace2Path.BBOXES_FILE + subset_name + '.csv')
os.makedirs(osp.dirname(bboxes_path), exist_ok=True)
with open(bboxes_path, 'w', newline='') as file:
with open(bboxes_path, 'w', encoding='utf-8', newline='') as file:
columns = ['NAME_ID', 'X', 'Y', 'W', 'H']
writer = csv.DictWriter(file, fieldnames=columns)
writer.writeheader()
24 changes: 16 additions & 8 deletions datumaro/plugins/voc_format/converter.py
@@ -297,7 +297,7 @@ def save_subsets(self):
VocTask.action_classification}:
ann_path = osp.join(self._ann_dir, item.id + '.xml')
os.makedirs(osp.dirname(ann_path), exist_ok=True)
with open(ann_path, 'w') as f:
with open(ann_path, 'w', encoding='utf-8') as f:
f.write(ET.tostring(root_elem,
encoding='unicode', pretty_print=True))

@@ -351,7 +351,7 @@ def save_subsets(self):
@staticmethod
def _get_filtered_lines(path, patch, subset, items=None):
lines = {}
with open(path) as f:
with open(path, encoding='utf-8') as f:
for line in f:
item, text, _ = line.split(maxsplit=1) + ['', '']
if not patch or patch.updated_items.get((item, subset)) != \
@@ -368,8 +368,10 @@ def save_action_lists(self, subset_name, action_list):
items = {k: True for k in action_list}
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)
with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
# if 1 < len(item.split()):
# item = '\"' + item + '\"'
f.write('%s\n' % item)

if not items and not self._patch:
@@ -393,7 +395,7 @@ def _write_item(f, item, objs, action):
if self._patch and osp.isfile(ann_file):
lines = self._get_filtered_lines(ann_file, None, subset_name)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in action_list:
_write_item(f, item, action_list[item], action)
@@ -419,7 +421,7 @@ def _write_item(f, item, item_labels):
lines = self._get_filtered_lines(ann_file, self._patch,
subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in class_lists:
_write_item(f, item, class_lists[item])
@@ -434,8 +436,10 @@ def save_clsdet_lists(self, subset_name, clsdet_list):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
# if 1 < len(item.split()):
# item = '\"' + item + '\"'
f.write('%s\n' % item)

def save_segm_lists(self, subset_name, segm_list):
@@ -446,12 +450,16 @@ def save_segm_lists(self, subset_name, segm_list):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
# if 1 < len(item.split()):
# item = '\"' + item + '\"'
f.write('%s\n' % item)

def save_layout_lists(self, subset_name, layout_list):
def _write_item(f, item, item_layouts):
# if 1 < len(item.split()):
# item = '\"' + item + '\"'
if item_layouts:
for obj_id in item_layouts:
f.write('%s % d\n' % (item, 1 + obj_id))
@@ -466,7 +474,7 @@ def _write_item(f, item, item_layouts):
if self._patch and osp.isfile(ann_file):
self._get_filtered_lines(ann_file, self._patch, subset_name, items)

with open(ann_file, 'w') as f:
with open(ann_file, 'w', encoding='utf-8') as f:
for item in items:
if item in layout_list:
_write_item(f, item, layout_list[item])
16 changes: 12 additions & 4 deletions datumaro/plugins/voc_format/extractor.py
@@ -58,8 +58,14 @@ def _load_categories(dataset_path):

@staticmethod
def _load_subset_list(subset_path):
with open(subset_path) as f:
return [line.split()[0] for line in f]
subset_list = []
with open(subset_path, encoding='utf-8') as f:
for line in f:
line = line.strip().split()
if 2 < len(line):
line[0] = ' '.join(line[i] for i in range(len(line)))
subset_list.append(line[0])
return subset_list

class VocClassificationExtractor(_VocExtractor):
def __iter__(self):
@@ -78,11 +84,13 @@ def _load_annotations(self):
anno_files = [s for s in dir_items(task_dir, '.txt')
if s.endswith('_' + osp.basename(self._path))]
for ann_filename in anno_files:
with open(osp.join(task_dir, ann_filename)) as f:
with open(osp.join(task_dir, ann_filename), encoding='utf-8') as f:
label = ann_filename[:ann_filename.rfind('_')]
label_id = self._get_label_id(label)
for line in f:
item, present = line.split()
objects = line.split()
item = ' '.join(objects[i] for i in range(len(objects) - 1))
present = objects[-1]
if present == '1':
annotations[item].append(label_id)

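
A rough sketch of how ImageSets/Main/<label>_<subset>.txt lines are split when the item id itself contains spaces: everything except the last token is treated as the id, and the last token as the presence flag. The id below is hypothetical:

```python
line = '2011 кадр 01 1\n'
objects = line.split()
item = ' '.join(objects[i] for i in range(len(objects) - 1))   # '2011 кадр 01'
present = objects[-1]                                           # '1'
if present == '1':
    pass  # the label from the file name is attached to this item
```

Note that rejoining with single spaces assumes ids never contain consecutive spaces.
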
4 changes: 2 additions & 2 deletions datumaro/plugins/widerface_format.py
@@ -62,7 +62,7 @@ def _load_categories(self):
def _load_items(self, path):
items = {}

with open(path, 'r') as f:
with open(path, 'r', encoding='utf-8') as f:
lines = f.readlines()

image_ids = [image_id for image_id, line in enumerate(lines)
@@ -178,5 +178,5 @@ def apply(self):
annotation_path = osp.join(save_dir, WiderFacePath.ANNOTATIONS_DIR,
'wider_face_' + subset_name + '_bbx_gt.txt')
os.makedirs(osp.dirname(annotation_path), exist_ok=True)
with open(annotation_path, 'w') as f:
with open(annotation_path, 'w', encoding='utf-8') as f:
f.write(wider_annotation)
8 changes: 4 additions & 4 deletions datumaro/plugins/yolo_format/converter.py
@@ -39,7 +39,7 @@ def apply(self):
label_categories = extractor.categories()[AnnotationType.label]
label_ids = {label.name: idx
for idx, label in enumerate(label_categories.items)}
with open(osp.join(save_dir, 'obj.names'), 'w') as f:
with open(osp.join(save_dir, 'obj.names'), 'w', encoding='utf-8') as f:
f.writelines('%s\n' % l[0]
for l in sorted(label_ids.items(), key=lambda x: x[1]))

Expand Down Expand Up @@ -88,15 +88,15 @@ def apply(self):

annotation_path = osp.join(subset_dir, '%s.txt' % item.id)
os.makedirs(osp.dirname(annotation_path), exist_ok=True)
with open(annotation_path, 'w') as f:
with open(annotation_path, 'w', encoding='utf-8') as f:
f.write(yolo_annotation)

subset_list_name = '%s.txt' % subset_name
subset_lists[subset_name] = subset_list_name
with open(osp.join(save_dir, subset_list_name), 'w') as f:
with open(osp.join(save_dir, subset_list_name), 'w', encoding='utf-8') as f:
f.writelines('%s\n' % s for s in image_paths.values())

with open(osp.join(save_dir, 'obj.data'), 'w') as f:
with open(osp.join(save_dir, 'obj.data'), 'w', encoding='utf-8') as f:
f.write('classes = %s\n' % len(label_ids))

for subset_name, subset_list_name in subset_lists.items():
Expand Down
Loading