Skip to content

Commit

Permalink
JSON reader - empty nested list should not create child value (#826)
Browse files Browse the repository at this point in the history
* JSON reader - empty nested list should not create child value

* PR review
  • Loading branch information
nevi-me authored Oct 13, 2021
1 parent d4bf0b6 commit e898de5
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 22 deletions.
41 changes: 19 additions & 22 deletions arrow/src/json/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1048,31 +1048,27 @@ impl Decoder {
}
DataType::Struct(fields) => {
// extract list values, with non-lists converted to Value::Null
let array_item_count = rows
.iter()
.map(|row| match row {
Value::Array(values) => values.len(),
_ => 1,
})
.sum();
let array_item_count = cur_offset.to_usize().unwrap();
let num_bytes = bit_util::ceil(array_item_count, 8);
let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes);
let mut struct_index = 0;
let rows: Vec<Value> = rows
.iter()
.flat_map(|row| {
if let Value::Array(values) = row {
values.iter().for_each(|_| {
bit_util::set_bit(
null_buffer.as_slice_mut(),
struct_index,
);
.flat_map(|row| match row {
Value::Array(values) if !values.is_empty() => {
values.iter().for_each(|value| {
if !value.is_null() {
bit_util::set_bit(
null_buffer.as_slice_mut(),
struct_index,
);
}
struct_index += 1;
});
values.clone()
} else {
struct_index += 1;
vec![Value::Null]
}
_ => {
vec![]
}
})
.collect();
Expand Down Expand Up @@ -2209,6 +2205,7 @@ mod tests {
{"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]}
{"a": null}
{"a": []}
{"a": [null]}
"#;
let mut reader = builder.build(Cursor::new(json_content)).unwrap();

Expand Down Expand Up @@ -2243,23 +2240,23 @@ mod tests {
.null_bit_buffer(Buffer::from(vec![0b00111111]))
.build();
let a_list = ArrayDataBuilder::new(a_field.data_type().clone())
.len(5)
.add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6]))
.len(6)
.add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7]))
.add_child_data(a)
.null_bit_buffer(Buffer::from(vec![0b00010111]))
.null_bit_buffer(Buffer::from(vec![0b00110111]))
.build();
let expected = make_array(a_list);

// compare `a` with result from json reader
let batch = reader.next().unwrap().unwrap();
let read = batch.column(0);
assert_eq!(read.len(), 5);
assert_eq!(read.len(), 6);
// compare the arrays the long way around, to better detect differences
let read: &ListArray = read.as_any().downcast_ref::<ListArray>().unwrap();
let expected = expected.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(
read.data().buffers()[0],
Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6])
Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7])
);
// compare list null buffers
assert_eq!(read.data().null_buffer(), expected.data().null_buffer());
Expand Down
52 changes: 52 additions & 0 deletions arrow/src/json/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1259,4 +1259,56 @@ mod tests {
r#"[{"an":"object"},{"another":"object"}]"#
);
}

#[test]
fn json_list_roundtrip() {
let json_content = r#"
{"list": [{"ints": 1}]}
{"list": [{}]}
{"list": []}
{"list": null}
{"list": [{"ints": null}]}
{"list": [null]}
"#;
let ints_struct =
DataType::Struct(vec![Field::new("ints", DataType::Int32, true)]);
let list_type = DataType::List(Box::new(Field::new("item", ints_struct, true)));
let list_field = Field::new("list", list_type, true);
let schema = Arc::new(Schema::new(vec![list_field]));
let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64);
let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap();

let batch = reader.next().unwrap().unwrap();

let list_row = batch
.column(0)
.as_any()
.downcast_ref::<ListArray>()
.unwrap();
let values = list_row.values();
assert_eq!(values.len(), 4);
assert_eq!(values.null_count(), 1);

// write the batch to JSON, and compare output with input
let mut buf = Vec::new();
{
let mut writer = LineDelimitedWriter::new(&mut buf);
writer.write_batches(&[batch]).unwrap();
}

// NOTE: The last value should technically be {"list": [null]} but it appears
// that implementations differ on the treatment of a null struct.
// It would be more accurate to return a null struct, so this can be done
// as a follow up.
assert_eq!(
String::from_utf8(buf).unwrap(),
r#"{"list":[{"ints":1}]}
{"list":[{}]}
{"list":[]}
{}
{"list":[{}]}
{"list":[{}]}
"#
);
}
}

0 comments on commit e898de5

Please sign in to comment.