openvinotoolkit · vinnamkim · Dec 18, 2023 · Dec 14, 2023 · Dec 14, 2023
@@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1194>)
 - Enhance visualizer to toggle plot title visibility
   (<https://github.com/openvinotoolkit/datumaro/pull/1228>)
+- Enhance Datumaro data format detect() to be memory-bounded and performant
+  (<https://github.com/openvinotoolkit/datumaro/pull/1229>)
 
 ### Bug fixes
 - Fix wrong example of Datumaro dataset creation in document

@@ -23,6 +23,7 @@
 from datumaro.components.format_detection import FormatDetectionContext
 from datumaro.components.importer import ImportContext, Importer
 from datumaro.components.media import Image
+from datumaro.rust_api import JsonSectionPageMapper
 from datumaro.util import parse_json
 from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
 from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
@@ -223,11 +224,12 @@ def detect(cls, context: FormatDetectionContext) -> None:
         with context.probe_text_file(
             annot_path,
             'must be a JSON object with an "annotation" key',
-        ) as f:
-            contents = parse_json(f.read())
-            if not isinstance(contents, dict):
-                raise Exception
-            if "annotation" not in contents:
+        ):
+            fpath = osp.join(context.root_path, annot_path)
+            page_mapper = JsonSectionPageMapper(fpath)
+            sections = page_mapper.sections()
+
+            if "annotation" not in sections.keys():
                 raise Exception
 
     @classmethod

@@ -14,6 +14,7 @@
 from datumaro.components.importer import ImportContext, Importer
 from datumaro.components.media import Video
 from datumaro.plugins.data_formats.video import VIDEO_EXTENSIONS
+from datumaro.rust_api import JsonSectionPageMapper
 from datumaro.util import parse_json, parse_json_file
 from datumaro.util.os_util import find_files
 
@@ -141,10 +142,18 @@ def detect(cls, context: FormatDetectionContext) -> None:
                     ann_file,
                     "JSON file must contain an youtube 'url' key",
                 ) as f:
-                    contents = parse_json(f.read())
+                    fpath = osp.join(context.root_path, ann_file)
+                    page_mapper = JsonSectionPageMapper(fpath)
+                    sections = page_mapper.sections()
+
+                    page_map = next(iter(sections.values()))
+                    offset, size = page_map["offset"], page_map["size"]
+
+                    f.seek(offset, 0)
+                    contents = parse_json(f.read(size))
                     if not isinstance(contents, dict):
                         raise Exception
-                    if "youtube" not in next(iter(contents.values())).get("url", ""):
+                    if "youtube" not in contents.get("url", ""):
                         raise Exception
 
             with context.alternative():

@@ -8,11 +8,13 @@
 from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
 from datumaro.components.importer import Importer
 from datumaro.errors import DatasetImportError
+from datumaro.rust_api import JsonSectionPageMapper
 from datumaro.util import parse_json
 
 
 class SegmentAnythingImporter(Importer):
     _N_JSON_TO_TEST = 10
+    _MAX_ANNOTATION_SECTION_BYTES = 100 * 1024 * 1024  # 100 MiB
 
     @classmethod
     def detect(
@@ -26,24 +28,37 @@
             with context.probe_text_file(
                 file, "Annotation format is not Segmentat-Anything format", is_binary_file=True
             ) as f:
-                anno = parse_json(f.read())
-                if (
-                    set(anno.keys()) != {"annotations", "image"}
-                    or (
-                        set(anno["image"].keys())
-                        != {
-                            "image_id",
-                            "width",
-                            "height",
-                            "file_name",
-                        }
-                    )
-                    or (
-                        anno["annotations"]
-                        and not {"id", "segmentation", "bbox"}.issubset(set(anno["annotations"][0]))
-                    )
-                ):
+                fpath = os.path.join(context.root_path, file)
+                page_mapper = JsonSectionPageMapper(fpath)
+                sections = page_mapper.sections()
+
+                if set(sections.keys()) != {"annotations", "image"}:
+                    raise DatasetImportError
+
+                offset, size = sections["image"]["offset"], sections["image"]["size"]
+                f.seek(offset, 0)
+                img_contents = parse_json(f.read(size))
+
+                if set(img_contents.keys()) != {
+                    "image_id",
+                    "width",
+                    "height",
+                    "file_name",
+                }:
+                    raise DatasetImportError
+
+                offset, size = sections["annotations"]["offset"], sections["annotations"]["size"]
+
+                if size > cls._MAX_ANNOTATION_SECTION_BYTES:
+                    msg = f"Annotation section is too huge. It exceeded {cls._MAX_ANNOTATION_SECTION_BYTES} bytes."
+                    raise DatasetImportError(msg)
+
+                f.seek(offset, 0)
+                ann_contents = parse_json(f.read(size))
+
+                if not {"id", "segmentation", "bbox"}.issubset(set(ann_contents[0])):
                     raise DatasetImportError
+
             if ctr > cls._N_JSON_TO_TEST:
                 break