Merging Places dataset #4132

Merged · 5 commits · Mar 8, 2024
57 changes: 57 additions & 0 deletions docs/source/user_guide/dataset_zoo/datasets.rst
@@ -68,6 +68,8 @@ This page lists all of the datasets available in the Dataset Zoo.
+--------------------------------------------------------------------+---------------------------------------------------------------------------+
| :ref:`Open Images V7 <dataset-zoo-open-images-v7>` | image, classification, detection, segmentation, keypoints, relationships |
+--------------------------------------------------------------------+---------------------------------------------------------------------------+
| :ref:`Places <dataset-zoo-places>` | image, classification |
+--------------------------------------------------------------------+---------------------------------------------------------------------------+
| :ref:`Quickstart <dataset-zoo-quickstart>` | image, quickstart |
+--------------------------------------------------------------------+---------------------------------------------------------------------------+
| :ref:`Quickstart Geo <dataset-zoo-quickstart-geo>` | image, location, quickstart |
@@ -3197,6 +3199,61 @@ Images V7 by passing them to
:alt: open-images-v7
:align: center

.. _dataset-zoo-places:

Places
------

Places is a scene recognition dataset of 10 million images spanning ~400
unique scene categories.

The images are labeled with semantic scene categories covering a large and
diverse set of the environments encountered in the world.

**Details**

- Dataset name: ``places``
- Dataset source: http://places2.csail.mit.edu/download-private.html
- Dataset size: 29 GB
- Tags: ``image, classification``
- Supported splits: ``train, validation, test``
- ZooDataset classes:
:class:`PlacesDataset <fiftyone.zoo.datasets.base.PlacesDataset>`

**Full split stats**

- Train split: 1,803,460 images, with between 3,068 and 5,000 per category
- Test split: 328,500 images, with 900 images per category
- Validation split: 36,500 images, with 100 images per category

**Example usage**

.. tabs::

.. group-tab:: Python

.. code-block:: python
:linenos:

import fiftyone as fo
import fiftyone.zoo as foz

dataset = foz.load_zoo_dataset("places", split="validation")

session = fo.launch_app(dataset)

.. group-tab:: CLI

.. code-block:: shell

fiftyone zoo datasets load places --split validation

fiftyone app launch places-validation

.. image:: /images/dataset_zoo/places-validation.png
:alt: places-validation
:align: center
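
The importer supports loading a random subset of a split. Below is a minimal
sketch, assuming the zoo loader forwards ``shuffle`` and ``max_samples``
through to the underlying importer, as it does for most zoo datasets:

.. code-block:: python
    :linenos:

    import fiftyone.zoo as foz

    # Load 1,000 randomly shuffled validation images
    dataset = foz.load_zoo_dataset(
        "places",
        split="validation",
        shuffle=True,
        max_samples=1000,
    )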

.. _dataset-zoo-quickstart:

Quickstart
11 changes: 11 additions & 0 deletions fiftyone/types/dataset_types.py
@@ -887,3 +887,14 @@ def get_dataset_exporter_cls(self):
import fiftyone.utils.data as foud

return foud.LegacyFiftyOneDatasetExporter


class PlacesDataset(ImageClassificationDataset):
"""A labeled dataset consisting of images and their associated lables
from the `Places dataset <http://places2.csail.mit.edu/index.html>`.
"""

def get_dataset_importer_cls(self):
import fiftyone.utils.places as foup

return foup.PlacesDatasetImporter
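
For reference, a minimal sketch of importing an already-downloaded split via
this type, without going through the zoo. The directory path is hypothetical,
and the sketch assumes PlacesDataset is exported on fiftyone.types like the
other types in this module:

import fiftyone as fo

# Hypothetical location of a previously downloaded split, laid out with
# "data/" (images) and "labels/labels.json" as produced by
# fiftyone.utils.places.download_places_dataset_split()
dataset_dir = "/path/to/places/validation"

# PlacesDataset resolves to PlacesDatasetImporter via
# get_dataset_importer_cls() above
dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=fo.types.PlacesDataset,
)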
313 changes: 313 additions & 0 deletions fiftyone/utils/places.py
@@ -0,0 +1,313 @@
"""
Utilities for working with the
`Places dataset <http://places2.csail.mit.edu/index.html>`_.

| Copyright 2017-2024, Voxel51, Inc.
| `voxel51.com <https://voxel51.com/>`_
|
"""
import json
import logging
import os
import random
import shutil

import eta.core.serial as etas
import eta.core.utils as etau
import eta.core.web as etaw

import fiftyone.utils.data as foud
import fiftyone.core.labels as fol

logger = logging.getLogger(__name__)


def download_places_dataset_split(dataset_dir, split, raw_dir=None):
"""Utility that downloads splits of the
    `Places dataset <http://places2.csail.mit.edu/index.html>`_.

Any existing files are not re-downloaded.

Args:
        dataset_dir: the directory to which to download the dataset
        split: the split to download. Supported values are
            ``("train", "validation", "test")``
        raw_dir (None): a directory in which full annotation files may be
            stored to avoid re-downloads in the future

Returns:
a tuple of:
- num_samples: the total number of downloaded images
- classes: the list of all classes
- did_download: whether any content was downloaded (True) or if all
necessary files were already downloaded (False)
"""
if split not in _IMAGE_DOWNLOAD_LINKS:
raise ValueError(
"Unsupported split '%s'; supported values are %s"
% (split, tuple(_IMAGE_DOWNLOAD_LINKS.keys()))
)

if raw_dir is None:
raw_dir = os.path.join(dataset_dir, "raw")

if not os.path.isdir(raw_dir):
etau.ensure_dir(raw_dir)
logger.info("Downloading annotations to %s if necessary!", raw_dir)

annot_tar = os.path.join(
raw_dir, os.path.basename(_ANNOTATION_DOWNLOAD_LINK)
)
if not os.path.isfile(annot_tar):
etaw.download_file(_ANNOTATION_DOWNLOAD_LINK, path=annot_tar)
etau.extract_tar(annot_tar, delete_tar=True)
else:
logger.info("Found annotations at '%s'", raw_dir)

did_download = False
images_dir = os.path.join(dataset_dir, "data")

if not os.path.isdir(images_dir):
etau.ensure_dir(images_dir)
logger.info(
"Downloading %s split from %s to %s",
split,
_IMAGE_DOWNLOAD_LINKS[split],
images_dir,
)

images_tar = os.path.join(
images_dir, os.path.basename(_IMAGE_DOWNLOAD_LINKS[split])
)
if not os.path.isfile(images_tar):
etaw.download_file(_IMAGE_DOWNLOAD_LINKS[split], path=images_tar)

logger.info("Extracting and moving images...")

etau.extract_tar(images_tar, delete_tar=True)

if split == "validation" or split == "test":
src = os.path.join(images_dir, _TAR_NAMES[split])
dst = images_dir

for f in os.listdir(src):
_dst = os.path.join(dst, f)
if os.path.isfile(_dst):
os.remove(_dst)
elif os.path.isdir(_dst):
shutil.rmtree(_dst, ignore_errors=True)

shutil.move(os.path.join(src, f), dst)

etau.delete_dir(src)

did_download = True

if split == "train":
src = os.path.join(images_dir, _TAR_NAMES[split])
dst = images_dir

for root, dirs, files in os.walk(src):
for file in files:
if (
file.endswith(".png")
or file.endswith(".jpg")
or file.endswith(".jpeg")
):
rel_path = os.path.relpath(root, src)
new_name = os.path.splitext(
os.path.join(rel_path, file)
)[0]
new_filename = (
new_name.replace(os.path.sep, "_")
+ os.path.splitext(file)[1]
)
destination_path = os.path.join(dst, new_filename)
shutil.move(os.path.join(root, file), destination_path)

for root, dirs, files in os.walk(src, topdown=False):
for dir_name in dirs:
dir_path = os.path.join(root, dir_name)
shutil.rmtree(dir_path)

did_download = True
else:
logger.info("Found %s split at '%s'", split, images_dir)

categories_map = {}
with open(
os.path.join(raw_dir, _ANNOTATION_FILE["categories"]), "r"
) as file:
for line in file:
components = line.strip().split()

category = components[0]
key = int(components[1])

categories_map[key] = category

if split != "test" and did_download:
labels_dir = os.path.join(dataset_dir, "labels")
etau.ensure_dir(labels_dir)
txt_file = os.path.join(raw_dir, _ANNOTATION_FILE[split])
json_file = os.path.join(labels_dir, "labels.json")

if split == "validation":
data = {}

with open(txt_file, "r") as file:
for line in file:
components = line.strip().split()

file_name = components[0]
category = int(components[1])

data[file_name] = categories_map[category]

with open(json_file, "w") as outfile:
json.dump(data, outfile, indent=4)

if split == "train":
data = {}

with open(txt_file, "r") as file:
for line in file:
components = line.strip().split()

file_name = components[0][1:].replace("/", "_")
category = int(components[1])

data[file_name] = categories_map[category]

with open(json_file, "w") as outfile:
json.dump(data, outfile, indent=4)

    num_samples = len(etau.list_files(images_dir))
classes = list(categories_map.values())

return num_samples, classes, did_download


class PlacesDatasetImporter(foud.LabeledImageDatasetImporter):
"""Class for importing datasets written by
    :meth:`download_places_dataset_split`.

Args:
dataset_dir: the dataset directory
shuffle (False): whether to randomly shuffle the order in which the
samples are imported
seed (None): a random seed to use when shuffling
max_samples (None): a maximum number of samples to load. By default,
all samples are imported
"""

def __init__(
self,
dataset_dir,
shuffle=False,
seed=None,
max_samples=None,
):
super().__init__(
dataset_dir=dataset_dir,
shuffle=shuffle,
seed=seed,
max_samples=max_samples,
)

self._images_map = None
self._labels_map = None
self._uuids = None
self._iter_uuids = None

@property
def has_image_metadata(self):
return False

@property
def has_dataset_info(self):
return False

@property
def label_cls(self):
return fol.Classification

def __iter__(self):
self._iter_uuids = iter(self._uuids)
return self

def __len__(self):
return len(self._uuids)

def __next__(self):
image_id = next(self._iter_uuids)
image_path = self._images_map[image_id]
uuid = os.path.basename(image_path)

label = self._labels_map.get(uuid, None)
if label is not None:
label = fol.Classification(label=label)

return image_path, None, label

def setup(self):
dataset_dir = self.dataset_dir

if self.seed is not None:
random.seed(self.seed)

data_dir = os.path.join(dataset_dir, "data")
labels_dir = os.path.join(dataset_dir, "labels")

images_map = {
os.path.splitext(filename)[0]: os.path.join(data_dir, filename)
for filename in etau.list_files(data_dir)
}

labels_path = os.path.join(labels_dir, "labels.json")
if os.path.isfile(labels_path):
labels_map = etas.load_json(labels_path)
else:
labels_map = {}

uuids = list(images_map.keys())

if self.shuffle:
random.shuffle(uuids)

if self.max_samples is not None:
uuids = uuids[: self.max_samples]

self._images_map = images_map
self._labels_map = labels_map
self._uuids = uuids

@staticmethod
def _get_num_samples(dataset_dir):
return len(etau.list_files(os.path.join(dataset_dir, "data")))


_IMAGE_DOWNLOAD_LINKS = {
"train": "http://data.csail.mit.edu/places/places365/train_256_places365standard.tar",
"validation": "http://data.csail.mit.edu/places/places365/val_256.tar",
"test": "http://data.csail.mit.edu/places/places365/test_256.tar",
}

_TAR_NAMES = {
"train": "data_256",
"validation": "val_256",
"test": "test_256",
}

_ANNOTATION_DOWNLOAD_LINK = "http://data.csail.mit.edu/places/places365/filelist_places365-standard.tar"

_ANNOTATION_FILE = {
"categories": "categories_places365.txt",
"train": "places365_train_standard.txt",
"validation": "places365_val.txt",
"test": "places365_test.txt",
}

_SPLIT_SIZES = {"train": 1803460, "validation": 36500, "test": 328500}

_SUPPORTED_SPLITS = ["train", "validation", "test"]
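
End-to-end, a minimal sketch of using these utilities directly; the target
directory is hypothetical, and Dataset.add_importer() is FiftyOne's standard
entry point for feeding a custom importer into a dataset:

import fiftyone as fo
import fiftyone.utils.places as foup

# Download the validation split (the smallest of the three);
# existing files are not re-downloaded
num_samples, classes, did_download = foup.download_places_dataset_split(
    "/path/to/places", split="validation"
)

# Import a shuffled subset of the downloaded images and labels
importer = foup.PlacesDatasetImporter(
    "/path/to/places", shuffle=True, max_samples=100
)

dataset = fo.Dataset("places-sample")
dataset.add_importer(importer)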