Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Classification task in LFW dataset format #222

Merged
merged 4 commits into from
Apr 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- LabelMe format saves dataset items with their relative paths by subsets without changing names (<https://github.com/openvinotoolkit/datumaro/pull/200>)
- Allowed arbitrary subset count and names in classification and detection splitters (<https://github.com/openvinotoolkit/datumaro/pull/207>)
- Annotation-less dataset elements now participate in subset splitting (<https://github.com/openvinotoolkit/datumaro/pull/211>)
- Classification task in LFW dataset format (<https://github.com/openvinotoolkit/datumaro/pull/222>)

### Deprecated
-
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ CVAT annotations ---> Publication, statistics etc.
- [LabelMe](http://labelme.csail.mit.edu/Release3.0)
- [ICDAR13/15](https://rrc.cvc.uab.es/?ch=2) (`word_recognition`, `text_localization`, `text_segmentation`)
- [Market-1501](https://www.aitribune.com/dataset/2018051063) (`person re-identification`)
- [LFW](http://vis-www.cs.umass.edu/lfw/) (`person re-identification`, `landmarks`)
- [LFW](http://vis-www.cs.umass.edu/lfw/) (`classification`, `person re-identification`, `landmarks`)
- Dataset building
- Merging multiple datasets into one
- Dataset filtering by custom criteria:
Expand Down
242 changes: 177 additions & 65 deletions datumaro/plugins/lfw_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@

from datumaro.components.converter import Converter
from datumaro.components.extractor import (AnnotationType, DatasetItem,
Importer, Points, SourceExtractor)
Importer, Label, LabelCategories, Points, SourceExtractor)
from datumaro.util.image import find_images


class LfwPath:
IMAGES_DIR = 'images'
LANDMARKS_FILE = 'landmarks.txt'
PAIRS_FILE = 'pairs.txt'
PEOPLE_FILE = 'people.txt'
IMAGE_EXT = '.jpg'
PATTERN = re.compile(r'([\w]+)_([-\d]+)')
PATTERN = re.compile(r'([\w-]+)_([-\d]+)')

class LfwExtractor(SourceExtractor):
def __init__(self, path, subset=None):
Expand All @@ -29,59 +30,101 @@ def __init__(self, path, subset=None):
super().__init__(subset=subset)

self._dataset_dir = osp.dirname(osp.dirname(path))

people_file = osp.join(osp.dirname(path), LfwPath.PEOPLE_FILE)
self._categories = self._load_categories(people_file)

self._items = list(self._load_items(path).values())

def _load_categories(self, path):
    """Build the label categories from the people file at `path`.

    Every line is stripped and split on tabs; a line that yields exactly
    two fields contributes its first field as a label name. Lines with
    any other field count are ignored. When `path` is not an existing
    file, the returned label categories are empty.

    Returns a dict mapping AnnotationType.label to the LabelCategories.
    """
    categories = LabelCategories()
    if not osp.isfile(path):
        return { AnnotationType.label: categories }

    with open(path, encoding='utf-8') as people_file:
        for record in people_file:
            fields = record.strip().split('\t')
            if len(fields) != 2:
                # Not a two-field record; nothing to register.
                continue
            categories.add(fields[0])
    return { AnnotationType.label: categories }

def _load_items(self, path):
items = {}
label_categories = self._categories.get(AnnotationType.label)

images_dir = osp.join(self._dataset_dir, self._subset, LfwPath.IMAGES_DIR)
if osp.isdir(images_dir):
images = { osp.splitext(osp.relpath(p, images_dir))[0]: p
images = { osp.splitext(osp.relpath(p, images_dir))[0].replace('\\', '/'): p
for p in find_images(images_dir, recursive=True) }
else:
images = {}

with open(path, encoding='utf-8') as f:
for line in f:
pair = line.strip().split('\t')
if len(pair) == 3:
if pair[0] == '-':
image1 = pair[1]
image2 = pair[2]
else:
image1 = self.get_image_name(pair[0], pair[1])
image2 = self.get_image_name(pair[0], pair[2])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=images.get(image1),
attributes={'positive_pairs': [], 'negative_pairs': []})
if image2 not in items:
items[image2] = DatasetItem(id=image2, subset=self._subset,
image=images.get(image2),
attributes={'positive_pairs': [], 'negative_pairs': []})
if len(pair) == 1 and pair[0] != '':
annotations = []
image = pair[0]
item_id = pair[0]
objects = item_id.split('/')
if 1 < len(objects):
label_name = objects[0]
label = label_categories.find(label_name)[0]
if label != None:
annotations.append(Label(label))
item_id = item_id[len(label_name) + 1:]
if item_id not in items:
items[item_id] = DatasetItem(id=item_id, subset=self._subset,
image=images.get(image), annotations=annotations)
elif len(pair) == 3:
image1, id1 = self.get_image_name(pair[0], pair[1])
image2, id2 = self.get_image_name(pair[0], pair[2])
label = label_categories.find(pair[0])[0]
if label == None:
raise Exception("Line %s: people file doesn't "
"contain person %s " % (line, pair[0]))
if id1 not in items:
annotations = []
annotations.append(Label(label))
items[id1] = DatasetItem(id=id1, subset=self._subset,
image=images.get(image1), annotations=annotations)
if id2 not in items:
annotations = []
annotations.append(Label(label))
items[id2] = DatasetItem(id=id2, subset=self._subset,
image=images.get(image2), annotations=annotations)

# pairs form a directed graph
items[image1].attributes['positive_pairs'].append(image2)
if not items[id1].annotations[0].attributes.get('positive_pairs'):
items[id1].annotations[0].attributes['positive_pairs'] = []
items[id1].annotations[0].attributes['positive_pairs'].append(image2)

elif len(pair) == 4:
if pair[0] == '-':
image1 = pair[1]
else:
image1 = self.get_image_name(pair[0], pair[1])
image1, id1 = self.get_image_name(pair[0], pair[1])
if pair[2] == '-':
image2 = pair[3]
id2 = pair[3]
else:
image2 = self.get_image_name(pair[2], pair[3])
if image1 not in items:
items[image1] = DatasetItem(id=image1, subset=self._subset,
image=images.get(image1),
attributes={'positive_pairs': [], 'negative_pairs': []})
if image2 not in items:
items[image2] = DatasetItem(id=image2, subset=self._subset,
image=images.get(image2),
attributes={'positive_pairs': [], 'negative_pairs': []})
image2, id2 = self.get_image_name(pair[2], pair[3])
if id1 not in items:
annotations = []
label = label_categories.find(pair[0])[0]
if label == None:
raise Exception("Line %s: people file doesn't "
"contain person %s " % (line, pair[0]))
annotations.append(Label(label))
items[id1] = DatasetItem(id=id1, subset=self._subset,
image=images.get(image1), annotations=annotations)
if id2 not in items:
annotations = []
label = label_categories.find(pair[2])[0]
if label != None:
annotations.append(Label(label))
items[id2] = DatasetItem(id=id2, subset=self._subset,
image=images.get(image2), annotations=annotations)

# pairs form a directed graph
items[image1].attributes['negative_pairs'].append(image2)
if not items[id1].annotations[0].attributes.get('negative_pairs'):
items[id1].annotations[0].attributes['negative_pairs'] = []
items[id1].annotations[0].attributes['negative_pairs'].append(image2)

landmarks_file = osp.join(self._dataset_dir, self._subset,
LfwPath.LANDMARKS_FILE)
Expand All @@ -91,10 +134,15 @@ def _load_items(self, path):
line = line.split('\t')

item_id = osp.splitext(line[0])[0]
objects = item_id.split('/')
if 1 < len(objects):
label_name = objects[0]
label = label_categories.find(label_name)[0]
if label != None:
item_id = item_id[len(label_name) + 1:]
if item_id not in items:
items[item_id] = DatasetItem(id=item_id, subset=self._subset,
image=osp.join(images_dir, line[0]),
attributes={'positive_pairs': [], 'negative_pairs': []})
image=osp.join(images_dir, line[0]))

annotations = items[item_id].annotations
annotations.append(Points([float(p) for p in line[1:]]))
Expand All @@ -103,7 +151,15 @@ def _load_items(self, path):

@staticmethod
def get_image_name(person, image_id):
return '{}/{}_{:04d}'.format(person, person, int(image_id))
image, item_id = '', ''
try:
image_id = int(image_id)
image = '{}/{}_{:04d}'.format(person, person, image_id)
item_id = '{}_{:04d}'.format(person, image_id)
except ValueError:
image = '{}/{}'.format(person, image_id)
item_id = image_id
return image, item_id

class LfwImporter(Importer):
@classmethod
Expand All @@ -115,42 +171,90 @@ class LfwConverter(Converter):

def apply(self):
for subset_name, subset in self._extractor.subsets().items():
label_categories = self._extractor.categories()[AnnotationType.label]
labels = {}
for label in label_categories:
f = label.name
labels[label.name] = 0

positive_pairs = []
negative_pairs = []
neutral_items = []
landmarks = []
included_items = []

for item in subset:
anns = [ann for ann in item.annotations
if ann.type == AnnotationType.label]
label, label_name = None, None
if anns:
label = anns[0]
label_name = label_categories[anns[0].label].name
labels[label_name] += 1

if self._save_images and item.has_image:
self._save_image(item,
subdir=osp.join(subset_name, LfwPath.IMAGES_DIR))

search = LfwPath.PATTERN.search(item.id)
if search:
person1, num1 = search.groups()
num1 = int(num1)
else:
person1 = '-'
subdir=osp.join(subset_name, LfwPath.IMAGES_DIR)
if label_name:
subdir=osp.join(subdir, label_name)
self._save_image(item, subdir=subdir)

if label != None:
person1 = label_name
num1 = item.id
if 'positive_pairs' in item.attributes:
for pair in item.attributes['positive_pairs']:
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))
if 'negative_pairs' in item.attributes:
for pair in item.attributes['negative_pairs']:
search = LfwPath.PATTERN.search(pair)
if search:
person2, num2 = search.groups()
num2 = int(num2)
else:
person2 = '-'
num2 = pair
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))
if num1.startswith(person1):
num1 = int(num1.replace(person1, '')[1:])
curr_item = person1 + '/' + str(num1)

if 'positive_pairs' in label.attributes:
if curr_item not in included_items:
included_items.append(curr_item)
for pair in label.attributes['positive_pairs']:
search = LfwPath.PATTERN.search(pair)
if search:
num2 = search.groups()[1]
num2 = int(num2)
else:
num2 = pair
if num2.startswith(person1):
num2 = num2.replace(person1, '')[1:]
curr_item = person1 + '/' + str(num2)
if curr_item not in included_items:
included_items.append(curr_item)
positive_pairs.append('%s\t%s\t%s' % (person1, num1, num2))

if 'negative_pairs' in label.attributes:
if curr_item not in included_items:
included_items.append(curr_item)
for pair in label.attributes['negative_pairs']:
search = LfwPath.PATTERN.search(pair)
curr_item = ''
if search:
person2, num2 = search.groups()
num2 = int(num2)
curr_item += person2 + '/'
else:
person2 = '-'
num2 = pair
objects = pair.split('/')
if 1 < len(objects) and objects[0] in labels:
person2 = objects[0]
num2 = pair.replace(person2, '')[1:]
curr_item += person2 + '/'
curr_item += str(num2)
if curr_item not in included_items:
included_items.append(curr_item)
negative_pairs.append('%s\t%s\t%s\t%s' % \
(person1, num1, person2, num2))

if 'positive_pairs' not in label.attributes and \
'negative_pairs' not in label.attributes and \
curr_item not in included_items:
neutral_items.append('%s/%s' % (person1, item.id))
included_items.append(curr_item)

elif item.id not in included_items:
neutral_items.append(item.id)
included_items.append(item.id)

item_landmarks = [p for p in item.annotations
if p.type == AnnotationType.points]
Expand All @@ -163,9 +267,17 @@ def apply(self):
with open(pairs_file, 'w', encoding='utf-8') as f:
f.writelines(['%s\n' % pair for pair in positive_pairs])
f.writelines(['%s\n' % pair for pair in negative_pairs])
f.writelines(['%s\n' % item for item in neutral_items])

if landmarks:
landmarks_file = osp.join(self._save_dir, subset_name,
LfwPath.LANDMARKS_FILE)
with open(landmarks_file, 'w', encoding='utf-8') as f:
f.writelines(['%s\n' % landmark for landmark in landmarks])

if labels:
people_file = osp.join(self._save_dir, subset_name,
LfwPath.PEOPLE_FILE)
with open(people_file, 'w', encoding='utf-8') as f:
f.writelines(['%s\t%d\n' % (label, labels[label])
for label in labels])
2 changes: 1 addition & 1 deletion docs/user_manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ List of supported formats:
- Market-1501 (`person re-identification`)
- [Format specification](https://www.aitribune.com/dataset/2018051063)
- [Dataset example](../tests/assets/market1501_dataset)
- LFW (`person re-identification`, `landmarks`)
- LFW (`classification`, `person re-identification`, `landmarks`)
- [Format specification](http://vis-www.cs.umass.edu/lfw/)
- [Dataset example](../tests/assets/lfw_dataset)

Expand Down
3 changes: 3 additions & 0 deletions tests/assets/lfw_dataset/test/people.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
2
name0 2
name1 2
Loading