Skip to content

Commit

Permalink
Add memory bounded datumaro data format detect to release 1.5.1 (#1241)
Browse files Browse the repository at this point in the history
### Summary

- Ticket no 128951
- Apply #1224 and #1229 changes to the releases/1.5.0 branch

### How to test
Already tested in the previous PRs.

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [ ] I have added the description of my changes into the
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [ ] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Kim, Vinnam <[email protected]>
  • Loading branch information
vinnamkim authored Jan 11, 2024
1 parent 375d184 commit e426036
Show file tree
Hide file tree
Showing 7 changed files with 293 additions and 28 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1174>)
- Add ImportError to catch GitPython import error
(<https://github.com/openvinotoolkit/datumaro/pull/1174>)
- Enhance Datumaro data format detect() to be memory-bounded and performant
(<https://github.com/openvinotoolkit/datumaro/pull/1229>)

### Bug fixes
- Modify the draw function in the visualizer not to raise an error for unsupported annotation types.
Expand Down
232 changes: 232 additions & 0 deletions rust/src/json_section_page_mapper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
// Copyright (C) 2023 Intel Corporation
//
// SPDX-License-Identifier: MIT

use crate::{
page_mapper::{JsonPageMapper, ParsedJsonSection},
utils::read_skipping_ws,
};
use pyo3::{prelude::*, types::PyDict};
use std::{
collections::HashMap,
fs::File,
io::{self, BufReader, Read, Seek},
path::Path,
};

/// A top-level key of a JSON document together with the byte span of its
/// value within the file, so the value can be re-read later without parsing
/// the whole document.
#[derive(Debug)]
struct JsonSection {
    key: String,    // the top-level JSON key
    offset: usize,  // byte offset of the first byte of the value
    size: usize,    // length of the value in bytes
}

/// Consume the remainder of an array/object value whose opening bracket(s)
/// are already on `stack`, leaving the reader positioned just past the
/// matching closing bracket of the outermost container.
///
/// `last_token` must be the innermost currently-open bracket (`'{'` or `'['`).
/// Returns `InvalidData` when a closing bracket does not match the innermost
/// open one.
fn handle_arr_or_dict(
    mut stack: Vec<u8>,
    mut reader: impl Read + Seek,
    mut last_token: u8,
) -> Result<(), io::Error> {
    while !stack.is_empty() {
        match read_skipping_ws(&mut reader) {
            Ok(c) => match c {
                b'{' | b'[' => {
                    // Entering a nested container.
                    stack.push(c);
                    last_token = c;
                }
                b'}' => {
                    if last_token != b'{' {
                        let cur_pos = reader.stream_position()?;
                        let msg = format!("Last token in the stack is '{}', but the given token at offset={} is '}}'", last_token as char, cur_pos);
                        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
                    }
                    stack.pop();
                    if !stack.is_empty() {
                        last_token = *stack
                            .last()
                            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
                    }
                }
                b']' => {
                    if last_token != b'[' {
                        let cur_pos = reader.stream_position()?;
                        let msg = format!("Last token in the stack is '{}', but the given token at offset={} is ']'", last_token as char, cur_pos);
                        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
                    }
                    stack.pop();
                    if !stack.is_empty() {
                        last_token = *stack
                            .last()
                            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
                    }
                }
                b'"' => {
                    // Skip a string literal. Track backslash escapes so that an
                    // escaped quote (\") inside the string does not terminate it
                    // early — previously that desynchronized bracket matching.
                    // NOTE(review): `read_skipping_ws` also skips whitespace
                    // inside the string, which is fine for locating the closing
                    // quote but worth confirming against the util's contract.
                    let mut escaped = false;
                    while let Ok(c) = read_skipping_ws(&mut reader) {
                        if escaped {
                            escaped = false;
                        } else if c == b'\\' {
                            escaped = true;
                        } else if c == b'"' {
                            break;
                        }
                    }
                }
                _ => {}
            },
            Err(err) => {
                return Err(err);
            }
        }
    }
    Ok(())
}

/// Consume a string value whose opening quote has already been read, leaving
/// the reader positioned just past the closing quote.
///
/// Backslash escapes are honored so that an escaped quote (\") inside the
/// string does not end it prematurely (previously it did, corrupting the
/// computed section size).
fn handle_string(mut reader: impl Read + Seek) -> Result<(), io::Error> {
    let mut escaped = false;
    while let Ok(c) = read_skipping_ws(&mut reader) {
        if escaped {
            escaped = false;
        } else if c == b'\\' {
            escaped = true;
        } else if c == b'"' {
            break;
        }
    }
    Ok(())
}

/// Scan forward to the upcoming JSON value and return the absolute byte
/// offset of its first byte.
///
/// Every consumed byte is pushed onto `stack` so the caller can inspect the
/// last token (`'{'`, `'['`, `'"'`, `','`, ...) to decide how the value ends.
fn get_offset(mut reader: impl Read + Seek, stack: &mut Vec<u8>) -> Result<usize, io::Error> {
    // Sentinel: usize::MAX means "no scalar byte seen yet".
    let mut offset = usize::MAX;
    while let Ok(c) = read_skipping_ws(&mut reader) {
        stack.push(c);
        match c {
            b'{' | b'[' | b'"' => {
                // Container or string: it starts at the byte just read
                // (stream_position is one past that byte, hence the -1).
                return Ok(reader.stream_position()? as usize - 1);
            }
            b',' => {
                // Scalar terminated by ',': return the earliest scalar byte.
                // NOTE(review): if ',' is the very first byte consumed,
                // `offset` is still usize::MAX and the result is bogus —
                // presumably callers never hit that; confirm upstream.
                return Ok(offset - 1);
            }
            _ => {
                // Scalar byte: remember the earliest position seen.
                let pos = reader.stream_position()? as usize;
                offset = std::cmp::min(pos, offset);
            }
        }
    }
    Err(io::Error::new(
        io::ErrorKind::InvalidData,
        "Cannot get offset",
    ))
}

impl ParsedJsonSection for JsonSection {
    /// Parse one top-level `key: value` pair, returning the key together with
    /// the byte span (`offset`, `size`) of its value.
    ///
    /// On entry the reader is positioned after the key string; on exit it is
    /// positioned just past the value, ready for the next section.
    fn parse(buf_key: String, mut reader: impl Read + Seek) -> Result<Box<JsonSection>, io::Error> {
        // Move reader's cursor right after ':'
        while let Ok(c) = read_skipping_ws(&mut reader) {
            if c == b':' {
                break;
            }
        }

        let mut stack = vec![];

        // Absolute offset of the first byte of the value.
        let start_offset = get_offset(&mut reader, &mut stack)?;

        let last_token = *stack
            .last()
            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;

        // How the value ends depends on what kind of value it is.
        let end_offset = match last_token {
            b'[' | b'{' => {
                handle_arr_or_dict(stack, &mut reader, last_token)?;
                Ok(reader.stream_position()? as usize)
            }
            b'"' => {
                handle_string(&mut reader)?;
                Ok(reader.stream_position()? as usize)
            }
            b',' => Ok(reader.stream_position()? as usize - 1),
            _ => Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Unexpected token '{}' at the start of a JSON value",
                    last_token as char
                ),
            )),
        }?;

        let size = end_offset - start_offset;

        Ok(Box::new(JsonSection {
            key: buf_key,
            offset: start_offset,
            size,
        }))
    }
}

/// Maps each top-level key of a JSON file to the byte span of its value, so
/// callers can read individual sections without loading the whole document
/// into memory.
#[derive(Debug)]
pub struct JsonSectionPageMapperImpl {
    sections: Vec<Box<JsonSection>>, // one entry per top-level key
}

// Inherit the generic parsing driver (e.g. `parse_json`) from the shared trait.
impl JsonPageMapper<JsonSection> for JsonSectionPageMapperImpl {}

impl JsonSectionPageMapperImpl {
    /// Build a mapper by scanning `reader`'s JSON content for its top-level
    /// sections. Fails with an `io::Error` when the content is not valid
    /// enough to delimit sections.
    pub fn new(mut reader: impl Read + Seek) -> Result<Self, io::Error> {
        let sections = Self::parse_json(&mut reader)?;

        // Use field-init shorthand (was `sections: sections`).
        Ok(JsonSectionPageMapperImpl { sections })
    }
}

/// Python-facing wrapper (exposed via PyO3) around [`JsonSectionPageMapperImpl`].
#[pyclass]
pub struct JsonSectionPageMapper {
    // Holds the file handle for the object's lifetime; not read after
    // construction — NOTE(review): confirm keeping it open is intentional.
    reader: BufReader<File>,
    mapper: JsonSectionPageMapperImpl,
}

#[pymethods]
impl JsonSectionPageMapper {
    /// Python constructor: open the JSON file at `path` and parse its
    /// top-level sections.
    #[new]
    fn py_new(path: String) -> PyResult<Self> {
        let file = File::open(Path::new(&path))?;
        let mut reader = BufReader::new(file);
        let mapper = JsonSectionPageMapperImpl::new(&mut reader)?;

        Ok(JsonSectionPageMapper { reader, mapper })
    }

    /// Return `{key: {"offset": ..., "size": ...}}` for every top-level
    /// section, as a Python dict.
    fn sections(self_: PyRef<Self>) -> PyResult<PyObject> {
        let dict: HashMap<&str, HashMap<&str, usize>> = self_
            .mapper
            .sections
            .iter()
            .map(|section| {
                let nested_dict: HashMap<&str, usize> =
                    HashMap::from_iter([("offset", section.offset), ("size", section.size)]);
                (section.key.as_str(), nested_dict)
            })
            .collect();

        // Convert the nested HashMaps into Python dicts.
        Ok(dict.into_py(self_.py()))
    }

    /// Number of top-level sections found in the file (Python `len()`).
    fn __len__(&self) -> PyResult<usize> {
        Ok(self.mapper.sections.len())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::prepare_reader;

    /// Parse a representative Datumaro-format JSON document and verify every
    /// reported section span slices back to valid JSON.
    #[test]
    fn test_instance() {
        const EXAMPLE: &str = r#"{"dm_format_version": "1.0", "media_type": 2, "infos": {"string": "test", "int": 0, "float": 0.0, "string_list": ["test0", "test1", "test2"], "int_list": [0, 1, 2], "float_list": [0.0, 0.1, 0.2]}, "categories": {"label": {"labels": [{"name": "cat0", "parent": "", "attributes": ["x", "y"]}, {"name": "cat1", "parent": "", "attributes": ["x", "y"]}, {"name": "cat2", "parent": "", "attributes": ["x", "y"]}, {"name": "cat3", "parent": "", "attributes": ["x", "y"]}, {"name": "cat4", "parent": "", "attributes": ["x", "y"]}], "label_groups": [], "attributes": ["a", "b", "score"]}, "mask": {"colormap": [{"label_id": 0, "r": 0, "g": 0, "b": 0}, {"label_id": 1, "r": 128, "g": 0, "b": 0}, {"label_id": 2, "r": 0, "g": 128, "b": 0}, {"label_id": 3, "r": 128, "g": 128, "b": 0}, {"label_id": 4, "r": 0, "g": 0, "b": 128}]}, "points": {"items": [{"label_id": 0, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 1, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 2, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 3, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 4, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}]}}, "items": [{"id": "42", "annotations": [{"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}, {"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}], "image": {"path": "42.jpg", "size": [10, 6]}}, {"id": "43", "annotations": [], "image": {"path": "43.qq", "size": [2, 4]}}]}
"#;

        // Keep the temp file alive for the duration of the test; the binding
        // itself is otherwise unused.
        let (_tempfile, mut reader) = prepare_reader(EXAMPLE);
        let json_section_page_mapper = JsonSectionPageMapperImpl::new(&mut reader).unwrap();

        println!("{:?}", json_section_page_mapper);

        for section in json_section_page_mapper.sections {
            let offset = section.offset;
            let size = section.size;
            // Previously the Results of seek/read were silently ignored, and
            // `read` is allowed to read fewer than `size` bytes; `read_exact`
            // fills the buffer or fails loudly.
            reader
                .seek(io::SeekFrom::Start(offset as u64))
                .expect("seek to section offset");
            let mut buf = vec![0; size];
            reader.read_exact(buf.as_mut_slice()).expect("read section");

            let content: serde_json::Value = serde_json::from_str(
                std::str::from_utf8(buf.as_slice()).expect("Cannot change to utf8"),
            )
            .unwrap();
            println!("Section: {}, Content: {:?}", section.key, content);
        }
    }
}
3 changes: 3 additions & 0 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

mod coco_page_mapper;
mod datum_page_mapper;
mod json_section_page_mapper;
mod page_mapper;
mod page_maps;
mod test_helpers;
Expand All @@ -12,13 +13,15 @@ use pyo3::prelude::*;

use crate::coco_page_mapper::CocoPageMapper;
use crate::datum_page_mapper::DatumPageMapper;
use crate::json_section_page_mapper::JsonSectionPageMapper;

/// Datumaro Rust API
///
/// Python extension module (importable as `datumaro.rust_api`) that registers
/// the Rust-side page-mapper classes with Python.
#[pymodule]
#[pyo3(name = "rust_api")]
fn rust_api(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
    // Expose each page-mapper class on the module.
    m.add_class::<CocoPageMapper>()?;
    m.add_class::<DatumPageMapper>()?;
    m.add_class::<JsonSectionPageMapper>()?;

    Ok(())
}
12 changes: 7 additions & 5 deletions src/datumaro/plugins/data_formats/ade20k2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from datumaro.components.format_detection import FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json
from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
Expand Down Expand Up @@ -223,11 +224,12 @@ def detect(cls, context: FormatDetectionContext) -> None:
with context.probe_text_file(
annot_path,
'must be a JSON object with an "annotation" key',
) as f:
contents = parse_json(f.read())
if not isinstance(contents, dict):
raise Exception
if "annotation" not in contents:
):
fpath = osp.join(context.root_path, annot_path)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

if "annotation" not in sections.keys():
raise Exception

@classmethod
Expand Down
10 changes: 6 additions & 4 deletions src/datumaro/plugins/data_formats/datumaro/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.merge.extractor_merger import ExtractorMerger
from datumaro.util import parse_json
from datumaro.rust_api import JsonSectionPageMapper

from .format import DatumaroPath

Expand All @@ -28,9 +28,11 @@ def detect(
with context.probe_text_file(
annot_file,
'must be a JSON object with "categories" ' 'and "items" keys',
) as f:
contents = parse_json(f.read())
if not {"categories", "items"} <= contents.keys():
):
fpath = osp.join(context.root_path, annot_file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()
if not {"categories", "items"} <= sections.keys():
raise Exception

@classmethod
Expand Down
13 changes: 11 additions & 2 deletions src/datumaro/plugins/data_formats/kinetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Video
from datumaro.plugins.data_formats.video import VIDEO_EXTENSIONS
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json, parse_json_file
from datumaro.util.os_util import find_files

Expand Down Expand Up @@ -141,10 +142,18 @@ def detect(cls, context: FormatDetectionContext) -> None:
ann_file,
"JSON file must contain an youtube 'url' key",
) as f:
contents = parse_json(f.read())
fpath = osp.join(context.root_path, ann_file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

page_map = next(iter(sections.values()))
offset, size = page_map["offset"], page_map["size"]

f.seek(offset, 0)
contents = parse_json(f.read(size))
if not isinstance(contents, dict):
raise Exception
if "youtube" not in next(iter(contents.values())).get("url", ""):
if "youtube" not in contents.get("url", ""):
raise Exception

with context.alternative():
Expand Down
Loading

0 comments on commit e426036

Please sign in to comment.