Skip to content

Commit

Permalink
Add memory bounded datumaro data format detect to release 1.5.1 (#1241)
Browse files Browse the repository at this point in the history
### Summary

- Ticket no 128951
- Apply #1224 and #1229 changes to the releases/1.5.0 branch

### How to test
Already tested in the previous PRs.

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [ ] I have added the description of my changes into the
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [ ] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Kim, Vinnam <[email protected]>
  • Loading branch information
vinnamkim authored Jan 11, 2024
1 parent 375d184 commit e426036
Show file tree
Hide file tree
Showing 7 changed files with 293 additions and 28 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1174>)
- Add ImportError to catch GitPython import error
(<https://github.com/openvinotoolkit/datumaro/pull/1174>)
- Enhance Datumaro data format detect() to be memory-bounded and performant
(<https://github.com/openvinotoolkit/datumaro/pull/1229>)

### Bug fixes
- Modify the draw function in the visualizer not to raise an error for unsupported annotation types.
Expand Down
232 changes: 232 additions & 0 deletions rust/src/json_section_page_mapper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
// Copyright (C) 2023 Intel Corporation
//
// SPDX-License-Identifier: MIT

use crate::{
page_mapper::{JsonPageMapper, ParsedJsonSection},
utils::read_skipping_ws,
};
use pyo3::{prelude::*, types::PyDict};
use std::{
collections::HashMap,
fs::File,
io::{self, BufReader, Read, Seek},
path::Path,
};

/// A top-level key of a JSON document together with the byte span of its
/// value within the file, so the value can be re-read later without parsing
/// the whole document.
#[derive(Debug)]
struct JsonSection {
    key: String,    // the top-level JSON key
    offset: usize,  // byte offset of the first byte of the value
    size: usize,    // length of the value in bytes
}

/// Consume the remainder of an array/object value whose opening bracket(s)
/// are already on `stack`, leaving the reader positioned just past the
/// matching closing bracket of the outermost container.
///
/// `last_token` must be the innermost currently-open bracket (`'{'` or `'['`).
/// Returns `InvalidData` when a closing bracket does not match the innermost
/// open one.
fn handle_arr_or_dict(
    mut stack: Vec<u8>,
    mut reader: impl Read + Seek,
    mut last_token: u8,
) -> Result<(), io::Error> {
    while !stack.is_empty() {
        match read_skipping_ws(&mut reader) {
            Ok(c) => match c {
                b'{' | b'[' => {
                    // Entering a nested container.
                    stack.push(c);
                    last_token = c;
                }
                b'}' => {
                    if last_token != b'{' {
                        let cur_pos = reader.stream_position()?;
                        let msg = format!("Last token in the stack is '{}', but the given token at offset={} is '}}'", last_token as char, cur_pos);
                        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
                    }
                    stack.pop();
                    if !stack.is_empty() {
                        last_token = *stack
                            .last()
                            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
                    }
                }
                b']' => {
                    if last_token != b'[' {
                        let cur_pos = reader.stream_position()?;
                        let msg = format!("Last token in the stack is '{}', but the given token at offset={} is ']'", last_token as char, cur_pos);
                        return Err(io::Error::new(io::ErrorKind::InvalidData, msg));
                    }
                    stack.pop();
                    if !stack.is_empty() {
                        last_token = *stack
                            .last()
                            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;
                    }
                }
                b'"' => {
                    // Skip a string literal. Track backslash escapes so that an
                    // escaped quote (\") inside the string does not terminate it
                    // early — previously that desynchronized bracket matching.
                    // NOTE(review): `read_skipping_ws` also skips whitespace
                    // inside the string, which is fine for locating the closing
                    // quote but worth confirming against the util's contract.
                    let mut escaped = false;
                    while let Ok(c) = read_skipping_ws(&mut reader) {
                        if escaped {
                            escaped = false;
                        } else if c == b'\\' {
                            escaped = true;
                        } else if c == b'"' {
                            break;
                        }
                    }
                }
                _ => {}
            },
            Err(err) => {
                return Err(err);
            }
        }
    }
    Ok(())
}

/// Consume a string value whose opening quote has already been read, leaving
/// the reader positioned just past the closing quote.
///
/// Backslash escapes are honored so that an escaped quote (\") inside the
/// string does not end it prematurely (previously it did, corrupting the
/// computed section size).
fn handle_string(mut reader: impl Read + Seek) -> Result<(), io::Error> {
    let mut escaped = false;
    while let Ok(c) = read_skipping_ws(&mut reader) {
        if escaped {
            escaped = false;
        } else if c == b'\\' {
            escaped = true;
        } else if c == b'"' {
            break;
        }
    }
    Ok(())
}

/// Scan forward to the upcoming JSON value and return the absolute byte
/// offset of its first byte.
///
/// Every consumed byte is pushed onto `stack` so the caller can inspect the
/// last token (`'{'`, `'['`, `'"'`, `','`, ...) to decide how the value ends.
fn get_offset(mut reader: impl Read + Seek, stack: &mut Vec<u8>) -> Result<usize, io::Error> {
    // Sentinel: usize::MAX means "no scalar byte seen yet".
    let mut offset = usize::MAX;
    while let Ok(c) = read_skipping_ws(&mut reader) {
        stack.push(c);
        match c {
            b'{' | b'[' | b'"' => {
                // Container or string: it starts at the byte just read
                // (stream_position is one past that byte, hence the -1).
                return Ok(reader.stream_position()? as usize - 1);
            }
            b',' => {
                // Scalar terminated by ',': return the earliest scalar byte.
                // NOTE(review): if ',' is the very first byte consumed,
                // `offset` is still usize::MAX and the result is bogus —
                // presumably callers never hit that; confirm upstream.
                return Ok(offset - 1);
            }
            _ => {
                // Scalar byte: remember the earliest position seen.
                let pos = reader.stream_position()? as usize;
                offset = std::cmp::min(pos, offset);
            }
        }
    }
    Err(io::Error::new(
        io::ErrorKind::InvalidData,
        "Cannot get offset",
    ))
}

impl ParsedJsonSection for JsonSection {
    /// Parse one top-level `key: value` pair, returning the key together with
    /// the byte span (`offset`, `size`) of its value.
    ///
    /// On entry the reader is positioned after the key string; on exit it is
    /// positioned just past the value, ready for the next section.
    fn parse(buf_key: String, mut reader: impl Read + Seek) -> Result<Box<JsonSection>, io::Error> {
        // Move reader's cursor right after ':'
        while let Ok(c) = read_skipping_ws(&mut reader) {
            if c == b':' {
                break;
            }
        }

        let mut stack = vec![];

        // Absolute offset of the first byte of the value.
        let start_offset = get_offset(&mut reader, &mut stack)?;

        let last_token = *stack
            .last()
            .ok_or(io::Error::new(io::ErrorKind::InvalidData, "stack is empty"))?;

        // How the value ends depends on what kind of value it is.
        let end_offset = match last_token {
            b'[' | b'{' => {
                handle_arr_or_dict(stack, &mut reader, last_token)?;
                Ok(reader.stream_position()? as usize)
            }
            b'"' => {
                handle_string(&mut reader)?;
                Ok(reader.stream_position()? as usize)
            }
            b',' => Ok(reader.stream_position()? as usize - 1),
            _ => Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "Unexpected token '{}' at the start of a JSON value",
                    last_token as char
                ),
            )),
        }?;

        let size = end_offset - start_offset;

        Ok(Box::new(JsonSection {
            key: buf_key,
            offset: start_offset,
            size,
        }))
    }
}

/// Maps each top-level key of a JSON file to the byte span of its value, so
/// callers can read individual sections without loading the whole document
/// into memory.
#[derive(Debug)]
pub struct JsonSectionPageMapperImpl {
    sections: Vec<Box<JsonSection>>, // one entry per top-level key
}

// Inherit the generic parsing driver (e.g. `parse_json`) from the shared trait.
impl JsonPageMapper<JsonSection> for JsonSectionPageMapperImpl {}

impl JsonSectionPageMapperImpl {
    /// Build a mapper by scanning `reader`'s JSON content for its top-level
    /// sections. Fails with an `io::Error` when the content is not valid
    /// enough to delimit sections.
    pub fn new(mut reader: impl Read + Seek) -> Result<Self, io::Error> {
        let sections = Self::parse_json(&mut reader)?;

        // Use field-init shorthand (was `sections: sections`).
        Ok(JsonSectionPageMapperImpl { sections })
    }
}

/// Python-facing wrapper (exposed via PyO3) around [`JsonSectionPageMapperImpl`].
#[pyclass]
pub struct JsonSectionPageMapper {
    // Holds the file handle for the object's lifetime; not read after
    // construction — NOTE(review): confirm keeping it open is intentional.
    reader: BufReader<File>,
    mapper: JsonSectionPageMapperImpl,
}

#[pymethods]
impl JsonSectionPageMapper {
    /// Python constructor: open the JSON file at `path` and parse its
    /// top-level sections.
    #[new]
    fn py_new(path: String) -> PyResult<Self> {
        let file = File::open(Path::new(&path))?;
        let mut reader = BufReader::new(file);
        let mapper = JsonSectionPageMapperImpl::new(&mut reader)?;

        Ok(JsonSectionPageMapper { reader, mapper })
    }

    /// Return `{key: {"offset": ..., "size": ...}}` for every top-level
    /// section, as a Python dict.
    fn sections(self_: PyRef<Self>) -> PyResult<PyObject> {
        let dict: HashMap<&str, HashMap<&str, usize>> = self_
            .mapper
            .sections
            .iter()
            .map(|section| {
                let nested_dict: HashMap<&str, usize> =
                    HashMap::from_iter([("offset", section.offset), ("size", section.size)]);
                (section.key.as_str(), nested_dict)
            })
            .collect();

        // Convert the nested HashMaps into Python dicts.
        Ok(dict.into_py(self_.py()))
    }

    /// Number of top-level sections found in the file (Python `len()`).
    fn __len__(&self) -> PyResult<usize> {
        Ok(self.mapper.sections.len())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::prepare_reader;

    /// Parse a representative Datumaro-format JSON document and verify every
    /// reported section span slices back to valid JSON.
    #[test]
    fn test_instance() {
        const EXAMPLE: &str = r#"{"dm_format_version": "1.0", "media_type": 2, "infos": {"string": "test", "int": 0, "float": 0.0, "string_list": ["test0", "test1", "test2"], "int_list": [0, 1, 2], "float_list": [0.0, 0.1, 0.2]}, "categories": {"label": {"labels": [{"name": "cat0", "parent": "", "attributes": ["x", "y"]}, {"name": "cat1", "parent": "", "attributes": ["x", "y"]}, {"name": "cat2", "parent": "", "attributes": ["x", "y"]}, {"name": "cat3", "parent": "", "attributes": ["x", "y"]}, {"name": "cat4", "parent": "", "attributes": ["x", "y"]}], "label_groups": [], "attributes": ["a", "b", "score"]}, "mask": {"colormap": [{"label_id": 0, "r": 0, "g": 0, "b": 0}, {"label_id": 1, "r": 128, "g": 0, "b": 0}, {"label_id": 2, "r": 0, "g": 128, "b": 0}, {"label_id": 3, "r": 128, "g": 128, "b": 0}, {"label_id": 4, "r": 0, "g": 0, "b": 128}]}, "points": {"items": [{"label_id": 0, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 1, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 2, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 3, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}, {"label_id": 4, "labels": ["cat1", "cat2"], "joints": [[0, 1]]}]}}, "items": [{"id": "42", "annotations": [{"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}, {"id": 900100087038, "type": "mask", "attributes": {}, "group": 900100087038, "label_id": null, "rle": {"counts": "06", "size": [2, 3]}, "z_order": 0}], "image": {"path": "42.jpg", "size": [10, 6]}}, {"id": "43", "annotations": [], "image": {"path": "43.qq", "size": [2, 4]}}]}
"#;

        // Keep the temp file alive for the duration of the test; the binding
        // itself is otherwise unused.
        let (_tempfile, mut reader) = prepare_reader(EXAMPLE);
        let json_section_page_mapper = JsonSectionPageMapperImpl::new(&mut reader).unwrap();

        println!("{:?}", json_section_page_mapper);

        for section in json_section_page_mapper.sections {
            let offset = section.offset;
            let size = section.size;
            // Previously the Results of seek/read were silently ignored, and
            // `read` is allowed to read fewer than `size` bytes; `read_exact`
            // fills the buffer or fails loudly.
            reader
                .seek(io::SeekFrom::Start(offset as u64))
                .expect("seek to section offset");
            let mut buf = vec![0; size];
            reader.read_exact(buf.as_mut_slice()).expect("read section");

            let content: serde_json::Value = serde_json::from_str(
                std::str::from_utf8(buf.as_slice()).expect("Cannot change to utf8"),
            )
            .unwrap();
            println!("Section: {}, Content: {:?}", section.key, content);
        }
    }
}
3 changes: 3 additions & 0 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

mod coco_page_mapper;
mod datum_page_mapper;
mod json_section_page_mapper;
mod page_mapper;
mod page_maps;
mod test_helpers;
Expand All @@ -12,13 +13,15 @@ use pyo3::prelude::*;

use crate::coco_page_mapper::CocoPageMapper;
use crate::datum_page_mapper::DatumPageMapper;
use crate::json_section_page_mapper::JsonSectionPageMapper;

/// Datumaro Rust API
///
/// Python extension module (importable as `datumaro.rust_api`) that registers
/// the Rust-side page-mapper classes with Python.
#[pymodule]
#[pyo3(name = "rust_api")]
fn rust_api(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
    // Expose each page-mapper class on the module.
    m.add_class::<CocoPageMapper>()?;
    m.add_class::<DatumPageMapper>()?;
    m.add_class::<JsonSectionPageMapper>()?;

    Ok(())
}
12 changes: 7 additions & 5 deletions src/datumaro/plugins/data_formats/ade20k2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from datumaro.components.format_detection import FormatDetectionContext
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json
from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
Expand Down Expand Up @@ -223,11 +224,12 @@ def detect(cls, context: FormatDetectionContext) -> None:
with context.probe_text_file(
annot_path,
'must be a JSON object with an "annotation" key',
) as f:
contents = parse_json(f.read())
if not isinstance(contents, dict):
raise Exception
if "annotation" not in contents:
):
fpath = osp.join(context.root_path, annot_path)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

if "annotation" not in sections.keys():
raise Exception

@classmethod
Expand Down
10 changes: 6 additions & 4 deletions src/datumaro/plugins/data_formats/datumaro/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.merge.extractor_merger import ExtractorMerger
from datumaro.util import parse_json
from datumaro.rust_api import JsonSectionPageMapper

from .format import DatumaroPath

Expand All @@ -28,9 +28,11 @@ def detect(
with context.probe_text_file(
annot_file,
'must be a JSON object with "categories" ' 'and "items" keys',
) as f:
contents = parse_json(f.read())
if not {"categories", "items"} <= contents.keys():
):
fpath = osp.join(context.root_path, annot_file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()
if not {"categories", "items"} <= sections.keys():
raise Exception

@classmethod
Expand Down
13 changes: 11 additions & 2 deletions src/datumaro/plugins/data_formats/kinetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Video
from datumaro.plugins.data_formats.video import VIDEO_EXTENSIONS
from datumaro.rust_api import JsonSectionPageMapper
from datumaro.util import parse_json, parse_json_file
from datumaro.util.os_util import find_files

Expand Down Expand Up @@ -141,10 +142,18 @@ def detect(cls, context: FormatDetectionContext) -> None:
ann_file,
"JSON file must contain an youtube 'url' key",
) as f:
contents = parse_json(f.read())
fpath = osp.join(context.root_path, ann_file)
page_mapper = JsonSectionPageMapper(fpath)
sections = page_mapper.sections()

page_map = next(iter(sections.values()))
offset, size = page_map["offset"], page_map["size"]

f.seek(offset, 0)
contents = parse_json(f.read(size))
if not isinstance(contents, dict):
raise Exception
if "youtube" not in next(iter(contents.values())).get("url", ""):
if "youtube" not in contents.get("url", ""):
raise Exception

with context.alternative():
Expand Down
Loading

0 comments on commit e426036

Please sign in to comment.