
Commit

Revert "Add hierarchical ImageNet-like dataset format (openvinotoolki…
Browse files Browse the repository at this point in the history
…t#1528)"

This reverts commit c296000.
sooahleex committed Jun 22, 2024
1 parent 8d89e59 commit b7b7ffb
Showing 17 changed files with 106 additions and 133 deletions.
2 changes: 1 addition & 1 deletion src/datumaro/cli/commands/downloaders/kaggle.py
@@ -30,7 +30,7 @@ def make_all_paths_absolute(d: Dict, root: str = "."):


KAGGLE_API_KEY_EXISTS = bool(os.environ.get("KAGGLE_KEY")) or os.path.exists(
os.path.join(os.path.expanduser("~"), ".kaggle", "kaggle.json")
os.path.join(os.path.expanduser("~"), ".kaggle")
)


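The reverted check treats Kaggle credentials as present when the KAGGLE_KEY environment variable is set or a ~/.kaggle directory exists, no longer requiring the kaggle.json file itself. A minimal standalone sketch of that logic, using only the standard library (the function name is illustrative):

import os
import os.path as osp

def kaggle_credentials_present() -> bool:
    # The environment variable takes precedence; otherwise fall back to
    # checking for the ~/.kaggle directory, as in the reverted check above.
    if os.environ.get("KAGGLE_KEY"):
        return True
    return osp.exists(osp.join(osp.expanduser("~"), ".kaggle"))
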
4 changes: 2 additions & 2 deletions src/datumaro/components/importer.py
@@ -22,7 +22,7 @@
from datumaro.components.errors import DatasetImportError, DatasetNotFoundError
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.merge.extractor_merger import ExtractorMerger
from datumaro.util.definitions import SUBSET_NAME_WHITELIST
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST

T = TypeVar("T")

@@ -197,7 +197,7 @@ def _change_context_root_path(context: FormatDetectionContext, path: str):
)

for sub_dir in os.listdir(path):
if sub_dir.lower() not in SUBSET_NAME_WHITELIST:
if sub_dir.lower() in SUBSET_NAME_BLACKLIST:
continue

sub_path = osp.join(path, sub_dir)
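
With the whitelist gone, subset discovery accepts any sub-directory whose lowercased name is not blacklisted. A small sketch of that filter, assuming the SUBSET_NAME_BLACKLIST value shown later in definitions.py:

import os
import os.path as osp

SUBSET_NAME_BLACKLIST = {"labels", "images", "annotations", "instances"}

def candidate_subset_dirs(path: str) -> list:
    # Sub-directories named e.g. "images" or "annotations" are skipped;
    # anything else ("train", "my_split", ...) is treated as a subset.
    subsets = []
    for sub_dir in os.listdir(path):
        if sub_dir.lower() in SUBSET_NAME_BLACKLIST:
            continue
        if osp.isdir(osp.join(path, sub_dir)):
            subsets.append(sub_dir)
    return subsets
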
28 changes: 8 additions & 20 deletions src/datumaro/plugins/data_formats/image_dir.py
@@ -4,12 +4,12 @@

import logging as log
import os
from pathlib import Path
import os.path as osp
from typing import List, Optional

from datumaro.components.dataset_base import DatasetItem, SubsetBase
from datumaro.components.exporter import Exporter
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.format_detection import FormatDetectionConfidence
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.util.image import IMAGE_EXTENSIONS, find_images
@@ -31,23 +31,11 @@ def build_cmdline_parser(cls, **kwargs):
)
return parser

@classmethod
def detect(cls, context: FormatDetectionContext) -> FormatDetectionConfidence:
path = Path(context.root_path)
for item in path.iterdir():
if item.is_dir():
context.fail("Only flat image directories are supported")
elif item.suffix.lower() not in IMAGE_EXTENSIONS:
context.fail(f"File {item} is not an image.")
return super().detect(context)

@classmethod
def find_sources(cls, path):
path = Path(path)
if not path.is_dir():
if not osp.isdir(path):
return []

return [{"url": str(path), "format": ImageDirBase.NAME}]
return [{"url": path, "format": ImageDirBase.NAME}]

@classmethod
def get_file_extensions(cls) -> List[str]:
@@ -63,11 +51,11 @@ def __init__(
ctx: Optional[ImportContext] = None,
):
super().__init__(subset=subset, ctx=ctx)
url = Path(url)
assert url.is_dir(), url

for path in find_images(str(url)):
item_id = Path(path).stem
assert osp.isdir(url), url

for path in find_images(url, recursive=True):
item_id = osp.relpath(osp.splitext(path)[0], url)
self._items.append(
DatasetItem(id=item_id, subset=self._subset, media=Image.from_file(path=path))
)
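
After the revert, image_dir scans the directory recursively again and derives item ids from the path relative to the root, so nested folders are allowed and reflected in the id. A usage sketch, assuming a local image folder at a placeholder path and the public Dataset API:

from datumaro import Dataset

# "path/to/images" is a placeholder for a local folder of images; a file
# such as dogs/img1.jpg becomes an item with id "dogs/img1".
dataset = Dataset.import_from("path/to/images", "image_dir")
for item in dataset:
    print(item.id, item.media.path)
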
152 changes: 62 additions & 90 deletions src/datumaro/plugins/data_formats/imagenet.py
@@ -5,8 +5,9 @@
import errno
import logging as log
import os
from pathlib import Path
from typing import List
import os.path as osp
import warnings
from typing import List, Optional

from datumaro.components.annotation import AnnotationType, Label, LabelCategories
from datumaro.components.dataset_base import DatasetItem, SubsetBase
@@ -15,9 +16,8 @@
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer, with_subset_dirs
from datumaro.components.media import Image
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST, SUBSET_NAME_WHITELIST
from datumaro.util.definitions import SUBSET_NAME_BLACKLIST
from datumaro.util.image import IMAGE_EXTENSIONS, find_images
from datumaro.util.os_util import walk


class ImagenetPath:
@@ -30,39 +30,40 @@ def __init__(
self,
path: str,
*,
subset: str | None = None,
ctx: ImportContext | None = None,
min_depth: int | None = None,
max_depth: int | None = None,
subset: Optional[str] = None,
ctx: Optional[ImportContext] = None,
):
if not Path(path).is_dir():
if not osp.isdir(path):
raise NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", path)

super().__init__(subset=subset, ctx=ctx)
self._max_depth = min_depth
self._min_depth = max_depth

self._categories = self._load_categories(path)
self._items = list(self._load_items(path).values())

def _load_categories(self, path):
label_cat = LabelCategories()
path = Path(path)
for dirname in sorted(d for d in path.rglob("*") if d.is_dir()):
dirname = dirname.relative_to(path)
if str(dirname) != ImagenetPath.IMAGE_DIR_NO_LABEL:
label_cat.add(str(dirname))
for dirname in sorted(os.listdir(path)):
if not os.path.isdir(os.path.join(path, dirname)):
warnings.warn(
f"{dirname} is not a directory in the folder {path}, so this will"
"be skipped when declaring the cateogries of `imagenet` dataset."
)
continue
if dirname != ImagenetPath.IMAGE_DIR_NO_LABEL:
label_cat.add(dirname)
return {AnnotationType.label: label_cat}

def _load_items(self, path):
items = {}

for image_path in find_images(
path, recursive=True, max_depth=self._max_depth, min_depth=self._min_depth
):
label = str(Path(image_path).parent.relative_to(path))
if label == ".": # image is located in the root directory
label = ImagenetPath.IMAGE_DIR_NO_LABEL
image_name = Path(image_path).stem
item_id = str(label) + ImagenetPath.SEP_TOKEN + image_name
# Images should be in root/label_dir/*.img and root/*.img is not allowed.
# => max_depth=1, min_depth=1
for image_path in find_images(path, recursive=True, max_depth=1, min_depth=1):
label = osp.basename(osp.dirname(image_path))
image_name = osp.splitext(osp.basename(image_path))[0]

item_id = label + ImagenetPath.SEP_TOKEN + image_name
item = items.get(item_id)
try:
if item is None:
@@ -88,70 +89,45 @@ def _load_items(self, path):


class ImagenetImporter(Importer):
"""
Multi-level version of ImagenetImporter.
For example, it imports the following directory structure.
"""TorchVision's ImageFolder style importer.
For example, it imports the following directory structure.
.. code-block:: text
root
├── label_0
│ ├── label_0_1
│ │ └── img1.jpg
│ └── label_0_2
│ └── img2.jpg
│ ├── label_0_1.jpg
│ └── label_0_2.jpg
└── label_1
└── img3.jpg
└── label_1_1.jpg
"""

_MIN_DEPTH = None
_MAX_DEPTH = None
_FORMAT = ImagenetBase.NAME
DETECT_CONFIDENCE = FormatDetectionConfidence.EXTREME_LOW

@classmethod
def detect(cls, context: FormatDetectionContext) -> FormatDetectionConfidence:
# Images must not be under a directory whose name is blacklisted.
for dname, dirnames, filenames in os.walk(context.root_path):
if dname in SUBSET_NAME_WHITELIST:
for dname in os.listdir(context.root_path):
dpath = osp.join(context.root_path, dname)
if osp.isdir(dpath) and dname.lower() in SUBSET_NAME_BLACKLIST:
context.fail(
f"Following directory names are not permitted: {SUBSET_NAME_WHITELIST}"
f"{dname} is found in {context.root_path}. "
"However, Images must not be under a directory whose name is blacklisted "
f"(SUBSET_NAME_BLACKLIST={SUBSET_NAME_BLACKLIST})."
)
rel_dname = Path(dname).relative_to(context.root_path)
level = len(rel_dname.parts)
if cls._MIN_DEPTH is not None and level < cls._MIN_DEPTH and filenames:
context.fail("Found files out of the directory level bounds.")
if cls._MAX_DEPTH is not None and level > cls._MAX_DEPTH and filenames:
context.fail("Found files out of the directory level bounds.")
dpath = Path(context.root_path) / rel_dname
if dpath.is_dir():
if str(rel_dname).lower() in SUBSET_NAME_BLACKLIST:
context.fail(
f"{dname} is found in {context.root_path}. "
"However, Images must not be under a directory whose name is blacklisted "
f"(SUBSET_NAME_BLACKLIST={SUBSET_NAME_BLACKLIST})."
)

return super().detect(context)

@classmethod
def contains_only_images(cls, path: str | Path):
for _, dirnames, filenames in walk(path, cls._MAX_DEPTH, cls._MIN_DEPTH):
if filenames:
for filename in filenames:
if Path(filename).suffix.lower() not in IMAGE_EXTENSIONS:
return False
elif not dirnames:
return False
return True

@classmethod
def find_sources(cls, path):
if not Path(path).is_dir():
if not osp.isdir(path):
return []

return [{"url": path, "format": cls._FORMAT}] if cls.contains_only_images(path) else []
# Images should be in root/label_dir/*.img and root/*.img is not allowed.
# => max_depth=1, min_depth=1
for _ in find_images(path, recursive=True, max_depth=1, min_depth=1):
return [{"url": path, "format": ImagenetBase.NAME}]

return []

@classmethod
def get_file_extensions(cls) -> List[str]:
@@ -168,36 +144,32 @@ def build_cmdline_parser(cls, **kwargs):

@with_subset_dirs
class ImagenetWithSubsetDirsImporter(ImagenetImporter):
"""Multi-level image directory structure importer.
Example:
"""TorchVision ImageFolder style importer.
For example, it imports the following directory structure.
.. code-block::
root
├── train
│ ├── label_0
│ │ ├── label_0_1
│ │ │ └── img1.jpg
│ │ └── label_0_2
│ │ └── img2.jpg
│ │ ├── label_0_1.jpg
│ │ └── label_0_2.jpg
│ └── label_1
│ └── img3.jpg
│ └── label_1_1.jpg
├── val
│ ├── label_0
│ │ ├── label_0_1
│ │ │ └── img1.jpg
│ │ └── label_0_2
│ │ └── img2.jpg
│ │ ├── label_0_1.jpg
│ │ └── label_0_2.jpg
│ └── label_1
│ └── img3.jpg
│ └── label_1_1.jpg
└── test
│ ├── label_0
│ ├── label_0_1
│ │ └── img1.jpg
│ └── label_0_2
│ └── img2.jpg
├── label_0
│ ├── label_0_1.jpg
│ └── label_0_2.jpg
└── label_1
└── img3.jpg
└── label_1_1.jpg
Then, it will have three subsets: train, val, and test and they have label_0 and label_1 labels.
"""


@@ -227,7 +199,7 @@ def _get_name(item: DatasetItem) -> str:
'For example, dataset.export("<path/to/output>", format="imagenet_with_subset_dirs").'
)

root_dir = Path(self._save_dir)
root_dir = self._save_dir
extractor = self._extractor
labels = {}
for item in self._extractor:
@@ -238,18 +210,18 @@
label_name = extractor.categories()[AnnotationType.label][label].name
self._save_image(
item,
subdir=root_dir / item.subset / label_name
subdir=osp.join(root_dir, item.subset, label_name)
if self.USE_SUBSET_DIRS
else root_dir / label_name,
else osp.join(root_dir, label_name),
name=file_name,
)

if not labels:
self._save_image(
item,
subdir=root_dir / item.subset / ImagenetPath.IMAGE_DIR_NO_LABEL
subdir=osp.join(root_dir, item.subset, ImagenetPath.IMAGE_DIR_NO_LABEL)
if self.USE_SUBSET_DIRS
else root_dir / ImagenetPath.IMAGE_DIR_NO_LABEL,
else osp.join(root_dir, ImagenetPath.IMAGE_DIR_NO_LABEL),
name=file_name,
)

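In its restored form, imagenet expects the flat root/label_dir/*.img layout shown in the docstring, and imagenet_with_subset_dirs expects one extra train/val/test level above it. A round-trip sketch with placeholder paths, assuming the standard Dataset API:

from datumaro import Dataset

# "path/to/imagenet_root" is a placeholder following root/label_dir/*.jpg.
dataset = Dataset.import_from("path/to/imagenet_root", "imagenet")

# Export keeps one directory per label; use "imagenet_with_subset_dirs"
# instead to add train/val/test directories above the label directories.
dataset.export("path/to/output", format="imagenet", save_media=True)
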
6 changes: 3 additions & 3 deletions src/datumaro/plugins/data_formats/yolo/exporter.py
@@ -15,7 +15,6 @@
from datumaro.components.exporter import Exporter
from datumaro.components.media import Image
from datumaro.util import str_to_bool
from datumaro.util.definitions import SUBSET_NAME_WHITELIST

from .format import YoloPath

@@ -196,6 +195,7 @@ def can_stream(self) -> bool:


class YoloUltralyticsExporter(YoloExporter):
allowed_subset_names = {"train", "val", "test"}
must_subset_names = {"train", "val"}

def __init__(self, extractor: IDataset, save_dir: str, **kwargs) -> None:
@@ -214,9 +214,9 @@ def _check_dataset(self):
subset_names = set(self._extractor.subsets().keys())

for subset in subset_names:
if subset not in SUBSET_NAME_WHITELIST:
if subset not in self.allowed_subset_names:
raise DatasetExportError(
f"The allowed subset name should be in {SUBSET_NAME_WHITELIST}, "
f"The allowed subset name is in {self.allowed_subset_names}, "
f'so that subset "{subset}" is not allowed.'
)

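The restored YoloUltralyticsExporter validates subset names against its own allowed_subset_names/must_subset_names sets rather than the shared whitelist, so a dataset whose subsets are not named train/val/test is rejected with DatasetExportError. A sketch of guarding an export against that, with placeholder paths and a COCO detection dataset as the assumed input:

from datumaro import Dataset
from datumaro.components.errors import DatasetExportError

# "path/to/detection_dataset" is a placeholder; the dataset's subsets must
# be named train/val/test (train and val are required) for this format.
dataset = Dataset.import_from("path/to/detection_dataset", "coco_instances")
try:
    dataset.export("path/to/yolo_out", format="yolo_ultralytics", save_media=True)
except DatasetExportError as e:
    print(f"Export rejected: {e}")
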
10 changes: 5 additions & 5 deletions src/datumaro/plugins/specs.json
@@ -1834,11 +1834,6 @@
"plugin_name": "anns_to_labels",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.AstypeAnnotations",
"plugin_name": "astype_annotations",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.BboxValuesDecrement",
"plugin_name": "bbox_values_decrement",
@@ -1959,6 +1954,11 @@
"plugin_name": "sort",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.transforms.AstypeAnnotations",
"plugin_name": "astype_annotations",
"plugin_type": "Transform"
},
{
"import_path": "datumaro.plugins.validators.ClassificationValidator",
"plugin_name": "classification",
Expand Down
1 change: 0 additions & 1 deletion src/datumaro/util/definitions.py
@@ -10,7 +10,6 @@
DEFAULT_SUBSET_NAME = "default"
BboxIntCoords = Tuple[int, int, int, int] # (x, y, w, h)
SUBSET_NAME_BLACKLIST = {"labels", "images", "annotations", "instances"}
SUBSET_NAME_WHITELIST = {"train", "test", "val"}


def get_datumaro_cache_dir(
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
