Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hierarchical ImageNet-like dataset format #1528

Merged
merged 5 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/datumaro/cli/commands/downloaders/kaggle.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def make_all_paths_absolute(d: Dict, root: str = "."):


KAGGLE_API_KEY_EXISTS = bool(os.environ.get("KAGGLE_KEY")) or os.path.exists(
os.path.join(os.path.expanduser("~"), ".kaggle")
os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json")
)


Expand Down
4 changes: 2 additions & 2 deletions src/datumaro/components/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from datumaro.components.errors import DatasetImportError, DatasetNotFoundError
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.merge.extractor_merger import ExtractorMerger
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST
from datumaro.util.definitions import SUBSET_NAME_WHITELIST

T = TypeVar("T")

Expand Down Expand Up @@ -197,7 +197,7 @@ def _change_context_root_path(context: FormatDetectionContext, path: str):
)

for sub_dir in os.listdir(path):
if sub_dir.lower() in SUBSET_NAME_BLACKLIST:
if sub_dir.lower() not in SUBSET_NAME_WHITELIST:
continue

sub_path = osp.join(path, sub_dir)
Expand Down
28 changes: 20 additions & 8 deletions src/datumaro/plugins/data_formats/image_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

import logging as log
import os
import os.path as osp
from pathlib import Path
from typing import List, Optional

from datumaro.components.dataset_base import DatasetItem, SubsetBase
from datumaro.components.exporter import Exporter
from datumaro.components.format_detection import FormatDetectionConfidence
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.util.image import IMAGE_EXTENSIONS, find_images
Expand All @@ -31,11 +31,23 @@
)
return parser

@classmethod
def detect(cls, context: FormatDetectionContext) -> FormatDetectionConfidence:
    """Check that the candidate root is a flat directory holding only images.

    Every direct child of ``context.root_path`` is inspected. A child that is
    a directory, or a file whose lower-cased suffix is not listed in
    ``IMAGE_EXTENSIONS``, is reported through ``context.fail``. When the scan
    completes, the decision is delegated to the parent class' ``detect``.

    Args:
        context: Format detection context; only ``root_path`` and ``fail``
            are used here.

    Returns:
        Whatever the parent implementation of ``detect`` returns.
    """
    for entry in Path(context.root_path).iterdir():
        if entry.is_dir():
            # Nested folders are outside of what this format describes.
            context.fail("Only flat image directories are supported")
            continue
        if entry.suffix.lower() not in IMAGE_EXTENSIONS:
            context.fail(f"File {entry} is not an image.")
    return super().detect(context)

Check warning on line 42 in src/datumaro/plugins/data_formats/image_dir.py

View check run for this annotation

Codecov / codecov/patch

src/datumaro/plugins/data_formats/image_dir.py#L42

Added line #L42 was not covered by tests

@classmethod
def find_sources(cls, path):
if not osp.isdir(path):
path = Path(path)
if not path.is_dir():
return []
return [{"url": path, "format": ImageDirBase.NAME}]

return [{"url": str(path), "format": ImageDirBase.NAME}]

@classmethod
def get_file_extensions(cls) -> List[str]:
Expand All @@ -51,11 +63,11 @@
ctx: Optional[ImportContext] = None,
):
super().__init__(subset=subset, ctx=ctx)
url = Path(url)
assert url.is_dir(), url

assert osp.isdir(url), url

for path in find_images(url, recursive=True):
item_id = osp.relpath(osp.splitext(path)[0], url)
for path in find_images(str(url)):
item_id = Path(path).stem
self._items.append(
DatasetItem(id=item_id, subset=self._subset, media=Image.from_file(path=path))
)
Expand Down
152 changes: 90 additions & 62 deletions src/datumaro/plugins/data_formats/imagenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import errno
import logging as log
import os
import os.path as osp
import warnings
from typing import List, Optional
from pathlib import Path
from typing import List

from datumaro.components.annotation import AnnotationType, Label, LabelCategories
from datumaro.components.dataset_base import DatasetItem, SubsetBase
Expand All @@ -16,8 +15,9 @@
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer, with_subset_dirs
from datumaro.components.media import Image
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST, SUBSET_NAME_WHITELIST
from datumaro.util.image import IMAGE_EXTENSIONS, find_images
from datumaro.util.os_util import walk


class ImagenetPath:
Expand All @@ -30,40 +30,39 @@
self,
path: str,
*,
subset: Optional[str] = None,
ctx: Optional[ImportContext] = None,
subset: str | None = None,
ctx: ImportContext | None = None,
min_depth: int | None = None,
max_depth: int | None = None,
):
if not osp.isdir(path):
if not Path(path).is_dir():
raise NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", path)

super().__init__(subset=subset, ctx=ctx)

# Store each bound under its matching attribute: these attributes are later
# forwarded to find_images() as max_depth=self._max_depth and
# min_depth=self._min_depth, so swapping them here would invert the bounds.
self._min_depth = min_depth
self._max_depth = max_depth
self._categories = self._load_categories(path)
self._items = list(self._load_items(path).values())

def _load_categories(self, path):
label_cat = LabelCategories()
for dirname in sorted(os.listdir(path)):
if not os.path.isdir(os.path.join(path, dirname)):
warnings.warn(
f"{dirname} is not a directory in the folder {path}, so this will"
"be skipped when declaring the cateogries of `imagenet` dataset."
)
continue
if dirname != ImagenetPath.IMAGE_DIR_NO_LABEL:
label_cat.add(dirname)
path = Path(path)
for dirname in sorted(d for d in path.rglob("*") if d.is_dir()):
dirname = dirname.relative_to(path)
if str(dirname) != ImagenetPath.IMAGE_DIR_NO_LABEL:
label_cat.add(str(dirname))
return {AnnotationType.label: label_cat}

def _load_items(self, path):
items = {}

# Images should be in root/label_dir/*.img and root/*.img is not allowed.
# => max_depth=1, min_depth=1
for image_path in find_images(path, recursive=True, max_depth=1, min_depth=1):
label = osp.basename(osp.dirname(image_path))
image_name = osp.splitext(osp.basename(image_path))[0]

item_id = label + ImagenetPath.SEP_TOKEN + image_name
for image_path in find_images(
path, recursive=True, max_depth=self._max_depth, min_depth=self._min_depth
):
label = str(Path(image_path).parent.relative_to(path))
if label == ".": # image is located in the root directory
label = ImagenetPath.IMAGE_DIR_NO_LABEL
image_name = Path(image_path).stem
item_id = str(label) + ImagenetPath.SEP_TOKEN + image_name
item = items.get(item_id)
try:
if item is None:
Expand All @@ -89,45 +88,70 @@


class ImagenetImporter(Importer):
"""TorchVision's ImageFolder style importer.
For example, it imports the following directory structure.
"""
Multi-level image directory structure importer.
For example, it imports the following directory structure.

.. code-block:: text

root
├── label_0
│ ├── label_0_1.jpg
│ └── label_0_2.jpg
│ ├── label_0_1
│ │ └── img1.jpg
│ └── label_0_2
│ └── img2.jpg
└── label_1
└── label_1_1.jpg
└── img3.jpg

"""

_MIN_DEPTH = None
_MAX_DEPTH = None
_FORMAT = ImagenetBase.NAME
DETECT_CONFIDENCE = FormatDetectionConfidence.EXTREME_LOW

@classmethod
def detect(cls, context: FormatDetectionContext) -> FormatDetectionConfidence:
# Images must not be under a directory whose name is blacklisted.
for dname in os.listdir(context.root_path):
dpath = osp.join(context.root_path, dname)
if osp.isdir(dpath) and dname.lower() in SUBSET_NAME_BLACKLIST:
for dname, dirnames, filenames in os.walk(context.root_path):
if dname in SUBSET_NAME_WHITELIST:
context.fail(
f"{dname} is found in {context.root_path}. "
"However, Images must not be under a directory whose name is blacklisted "
f"(SUBSET_NAME_BLACKLIST={SUBSET_NAME_BLACKLIST})."
f"Following directory names are not permitted: {SUBSET_NAME_WHITELIST}"
)
rel_dname = Path(dname).relative_to(context.root_path)
level = len(rel_dname.parts)
if cls._MIN_DEPTH is not None and level < cls._MIN_DEPTH and filenames:
context.fail("Found files out of the directory level bounds.")

Check warning on line 124 in src/datumaro/plugins/data_formats/imagenet.py

View check run for this annotation

Codecov / codecov/patch

src/datumaro/plugins/data_formats/imagenet.py#L124

Added line #L124 was not covered by tests
if cls._MAX_DEPTH is not None and level > cls._MAX_DEPTH and filenames:
context.fail("Found files out of the directory level bounds.")

Check warning on line 126 in src/datumaro/plugins/data_formats/imagenet.py

View check run for this annotation

Codecov / codecov/patch

src/datumaro/plugins/data_formats/imagenet.py#L126

Added line #L126 was not covered by tests
dpath = Path(context.root_path) / rel_dname
if dpath.is_dir():
if str(rel_dname).lower() in SUBSET_NAME_BLACKLIST:
context.fail(
f"{dname} is found in {context.root_path}. "
"However, Images must not be under a directory whose name is blacklisted "
f"(SUBSET_NAME_BLACKLIST={SUBSET_NAME_BLACKLIST})."
)

return super().detect(context)

@classmethod
def contains_only_images(cls, path: str | Path):
    """Tell whether the tree under ``path`` consists solely of image files.

    The tree is traversed with ``walk`` using the class depth limits
    ``_MAX_DEPTH`` and ``_MIN_DEPTH``.

    Args:
        path: Root directory to inspect.

    Returns:
        ``False`` as soon as a visited directory contains a file whose
        lower-cased suffix is not in ``IMAGE_EXTENSIONS``, or a visited
        directory has neither files nor sub-directories; ``True`` otherwise
        (including when ``walk`` yields nothing).
    """
    for _, subdirs, files in walk(path, cls._MAX_DEPTH, cls._MIN_DEPTH):
        if not files:
            if not subdirs:
                # A completely empty directory disqualifies the tree.
                # NOTE: Codecov (codecov/patch) reports this line as not
                # covered by tests.
                return False
            continue
        if any(Path(name).suffix.lower() not in IMAGE_EXTENSIONS for name in files):
            return False
    return True

@classmethod
def find_sources(cls, path):
if not osp.isdir(path):
if not Path(path).is_dir():
return []

# Images should be in root/label_dir/*.img and root/*.img is not allowed.
# => max_depth=1, min_depth=1
for _ in find_images(path, recursive=True, max_depth=1, min_depth=1):
return [{"url": path, "format": ImagenetBase.NAME}]

return []
return [{"url": path, "format": cls._FORMAT}] if cls.contains_only_images(path) else []

@classmethod
def get_file_extensions(cls) -> List[str]:
Expand All @@ -144,32 +168,36 @@

@with_subset_dirs
class ImagenetWithSubsetDirsImporter(ImagenetImporter):
"""TorchVision ImageFolder style importer.
For example, it imports the following directory structure.
"""Multi-level image directory structure importer.
Example:

.. code-block::

root
├── train
│ ├── label_0
│ │ ├── label_0_1.jpg
│ │ └── label_0_2.jpg
│ │ ├── label_0_1
│ │ │ └── img1.jpg
│ │ └── label_0_2
│ │ └── img2.jpg
│ └── label_1
│ └── label_1_1.jpg
│ └── img3.jpg
├── val
│ ├── label_0
│ │ ├── label_0_1.jpg
│ │ └── label_0_2.jpg
│ │ ├── label_0_1
│ │ │ └── img1.jpg
│ │ └── label_0_2
│ │ └── img2.jpg
│ └── label_1
│ └── label_1_1.jpg
│ └── img3.jpg
└── test
├── label_0
│ ├── label_0_1.jpg
│ └── label_0_2.jpg
│ ├── label_0
│ ├── label_0_1
│ │ └── img1.jpg
│ └── label_0_2
│ └── img2.jpg
└── label_1
└── label_1_1.jpg

Then, it will have three subsets: train, val, and test and they have label_0 and label_1 labels.
└── img3.jpg
"""


Expand Down Expand Up @@ -199,7 +227,7 @@
'For example, dataset.export("<path/to/output>", format="imagenet_with_subset_dirs").'
)

root_dir = self._save_dir
root_dir = Path(self._save_dir)
extractor = self._extractor
labels = {}
for item in self._extractor:
Expand All @@ -210,18 +238,18 @@
label_name = extractor.categories()[AnnotationType.label][label].name
self._save_image(
item,
subdir=osp.join(root_dir, item.subset, label_name)
subdir=root_dir / item.subset / label_name
if self.USE_SUBSET_DIRS
else osp.join(root_dir, label_name),
else root_dir / label_name,
name=file_name,
)

if not labels:
self._save_image(
item,
subdir=osp.join(root_dir, item.subset, ImagenetPath.IMAGE_DIR_NO_LABEL)
subdir=root_dir / item.subset / ImagenetPath.IMAGE_DIR_NO_LABEL
if self.USE_SUBSET_DIRS
else osp.join(root_dir, ImagenetPath.IMAGE_DIR_NO_LABEL),
else root_dir / ImagenetPath.IMAGE_DIR_NO_LABEL,
name=file_name,
)

Expand Down
6 changes: 3 additions & 3 deletions src/datumaro/plugins/data_formats/yolo/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from datumaro.components.exporter import Exporter
from datumaro.components.media import Image
from datumaro.util import str_to_bool
from datumaro.util.definitions import SUBSET_NAME_WHITELIST

from .format import YoloPath

Expand Down Expand Up @@ -195,7 +196,6 @@ def can_stream(self) -> bool:


class YoloUltralyticsExporter(YoloExporter):
allowed_subset_names = {"train", "val", "test"}
must_subset_names = {"train", "val"}

def __init__(self, extractor: IDataset, save_dir: str, **kwargs) -> None:
Expand All @@ -214,9 +214,9 @@ def _check_dataset(self):
subset_names = set(self._extractor.subsets().keys())

for subset in subset_names:
if subset not in self.allowed_subset_names:
if subset not in SUBSET_NAME_WHITELIST:
raise DatasetExportError(
f"The allowed subset name is in {self.allowed_subset_names}, "
f"The allowed subset name should be in {SUBSET_NAME_WHITELIST}, "
f'so that subset "{subset}" is not allowed.'
)

Expand Down
10 changes: 5 additions & 5 deletions src/datumaro/plugins/specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -1834,6 +1834,11 @@
"plugin_name": "anns_to_labels",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.AstypeAnnotations",
"plugin_name": "astype_annotations",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.BboxValuesDecrement",
"plugin_name": "bbox_values_decrement",
Expand Down Expand Up @@ -1949,11 +1954,6 @@
"plugin_name": "sort",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.AstypeAnnotations",
"plugin_name": "astype_annotations",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.validators.ClassificationValidator",
"plugin_name": "classification",
Expand Down
1 change: 1 addition & 0 deletions src/datumaro/util/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
DEFAULT_SUBSET_NAME = "default"
BboxIntCoords = Tuple[int, int, int, int] # (x, y, w, h)
SUBSET_NAME_BLACKLIST = {"labels", "images", "annotations", "instances"}
SUBSET_NAME_WHITELIST = {"train", "test", "val"}


def get_datumaro_cache_dir(
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading