From 8feebe4e0a3bf04ea3a0e4ecb27892e415b7fe68 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 24 Apr 2023 22:30:42 -0300 Subject: [PATCH] ListViewArray: Buffers validation, creation from JSON, and basic tests --- cpp/src/arrow/CMakeLists.txt | 2 + cpp/src/arrow/array/array_list_view_test.cc | 104 +++++++++++++++ cpp/src/arrow/array/list_view.cc | 45 ++++++- cpp/src/arrow/array/list_view.h | 26 +++- cpp/src/arrow/ipc/json_simple.cc | 18 +++ cpp/src/arrow/ipc/json_simple.h | 5 + cpp/src/arrow/testing/gtest_util.cc | 10 ++ cpp/src/arrow/testing/gtest_util.h | 6 + cpp/src/arrow/util/list_view_util.cc | 136 ++++++++++++++++++++ cpp/src/arrow/util/list_view_util.h | 53 ++++++++ 10 files changed, 399 insertions(+), 6 deletions(-) create mode 100644 cpp/src/arrow/array/array_list_view_test.cc create mode 100644 cpp/src/arrow/util/list_view_util.cc create mode 100644 cpp/src/arrow/util/list_view_util.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index de871a73f6baf..17287e33a4076 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -218,6 +218,7 @@ set(ARROW_SRCS util/future.cc util/int_util.cc util/io_util.cc + util/list_view_util.cc util/logging.cc util/key_value_metadata.cc util/memory.cc @@ -769,6 +770,7 @@ add_arrow_test(array_test array/array_binary_test.cc array/array_dict_test.cc array/array_list_test.cc + array/array_list_view_test.cc array/array_run_end_test.cc array/array_struct_test.cc array/array_union_test.cc diff --git a/cpp/src/arrow/array/array_list_view_test.cc b/cpp/src/arrow/array/array_list_view_test.cc new file mode 100644 index 0000000000000..d516d23628934 --- /dev/null +++ b/cpp/src/arrow/array/array_list_view_test.cc @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/list_view.h" +#include "arrow/array/util.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// List-view array tests + +namespace { + +class TestListViewArray : public ::testing::Test { + public: + std::shared_ptr string_values; + std::shared_ptr int32_values; + std::shared_ptr int16_values; + + void SetUp() override { + string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + } + + static std::shared_ptr Offsets(std::string_view json) { + return ArrayFromJSON(int32(), json); + } + + static std::shared_ptr Sizes(std::string_view json) { + return ArrayFromJSON(int32(), json); + } +}; + +} // namespace + +TEST_F(TestListViewArray, MakeArray) { + ASSERT_OK_AND_ASSIGN( + auto list_view_array, + ListViewArray::FromArrays(list_view(utf8()), Offsets("[0, 0, 1, 2]"), + Sizes("[2, 1, 1, 1]"), string_values)); + auto array_data = list_view_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *list_view_array); + // Should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); +} + +TEST_F(TestListViewArray, FromOffsetsAndSizes) { + std::shared_ptr list_view_array; + + ASSERT_OK_AND_ASSIGN( + list_view_array, + ListViewArray::FromArrays(list_view(int32()), Offsets("[0, 0, 1, 1000]"), + Sizes("[2, 1, 1, null]"), int32_values)); + ASSERT_EQ(list_view_array->length(), 4); + ASSERT_ARRAYS_EQUAL(*list_view_array->values(), *int32_values); + ASSERT_EQ(list_view_array->offset(), 0); + ASSERT_EQ(list_view_array->data()->GetNullCount(), 1); + ASSERT_EQ(list_view_array->data()->buffers.size(), 3); + + // Passing a non-zero logical offset + ASSERT_OK_AND_ASSIGN( + list_view_array, + ListViewArray::FromArrays(list_view(utf8()), Offsets("[0, 0, 1, 2]")->Slice(1), + Sizes("[2, 1, 1, 1]")->Slice(1), string_values)); + ASSERT_EQ(list_view_array->length(), 3); + ASSERT_ARRAYS_EQUAL(*list_view_array->values(), *string_values); + ASSERT_EQ(list_view_array->data()->GetNullCount(), 0); + ASSERT_EQ(list_view_array->offset(), 1); + + ASSERT_RAISES_WITH_MESSAGE( + Invalid, "Invalid: values type must match int32", + ListViewArray::FromArrays(list_view(int32()), Offsets("[0, 0, 1, 1000]"), + Sizes("[2, 1, 1, null]"), int16_values)); + ASSERT_RAISES_WITH_MESSAGE( + Invalid, "Invalid: offsets and sizes must have the same length", + ListViewArray::FromArrays(list_view(int32()), Offsets("[0, 0, 1]"), + Sizes("[2, 1, 1, null]"), int32_values)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/array/list_view.cc b/cpp/src/arrow/array/list_view.cc index 8536a5e2d8f38..b8d210afa1b4f 100644 --- a/cpp/src/arrow/array/list_view.cc +++ b/cpp/src/arrow/array/list_view.cc @@ -17,6 +17,7 @@ #include "arrow/array/list_view.h" #include "arrow/array/util.h" +#include "arrow/util/list_view_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -39,20 +40,24 @@ ListViewArray::ListViewArray(const std::shared_ptr& type, int64_t leng Result> ListViewArray::Make( const std::shared_ptr& type, int64_t length, const std::vector>& buffers, - const std::shared_ptr& values, int64_t null_count, int64_t offset) { + const std::shared_ptr& values, int64_t null_count, int64_t offset, + const std::shared_ptr& offsets_validity) { if (type->id() != Type::LIST_VIEW) { return Status::Invalid("Type must be LIST_VIEW"); } - // TODO(felipecrv): validate buffers and values array + RETURN_NOT_OK(list_view_util::internal::ValidateBuffers( + length, buffers, offset, values->length(), offsets_validity)); return std::make_shared(type, length, buffers, values, null_count, offset); } Result> ListViewArray::Make( int64_t length, const std::vector>& buffers, - const std::shared_ptr& values, int64_t null_count, int64_t offset) { + const std::shared_ptr& values, int64_t null_count, int64_t offset, + const std::shared_ptr& offsets_validity) { auto list_view_type = list_view(values->type()); - return Make(list_view_type, length, buffers, values, null_count, offset); + return Make(list_view_type, length, buffers, values, null_count, offset, + offsets_validity); } void ListViewArray::SetData(const std::shared_ptr& data) { @@ -66,4 +71,36 @@ void ListViewArray::SetData(const std::shared_ptr& data) { values_array_ = MakeArray(this->data()->child_data[0]); } +Result> ListViewArray::FromArrays( + const std::shared_ptr& type, const std::shared_ptr& offsets, + const std::shared_ptr& sizes, const std::shared_ptr& values) { + if (type->id() != Type::LIST_VIEW) { + return Status::Invalid("Expected an list-view type"); + } + if (offsets->type()->id() != Type::INT32) { + return Status::Invalid("offsets must be int32"); + } + if (sizes->type()->id() != Type::INT32) { + return Status::Invalid("sizes must be int32"); + } + const auto* list_view_type = internal::checked_cast(type.get()); + if (!list_view_type->value_type()->Equals(*values->type())) { + return Status::Invalid("values type must match ", + list_view_type->value_type()->ToString()); + } + if (offsets->length() != sizes->length()) { + return Status::Invalid("offsets and sizes must have the same length"); + } + if (offsets->offset() != sizes->offset()) { + return Status::Invalid("offsets and sizes must have the same offset"); + } + const std::vector> buffers = { + sizes->data()->buffers[0], + offsets->data()->buffers[1], + sizes->data()->buffers[1], + }; + return Make(type, sizes->length(), buffers, values, sizes->null_count(), + sizes->offset(), offsets->data()->buffers[0]); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/list_view.h b/cpp/src/arrow/array/list_view.h index a2902d62d3ec3..458c0e715f1b6 100644 --- a/cpp/src/arrow/array/list_view.h +++ b/cpp/src/arrow/array/list_view.h @@ -64,7 +64,7 @@ class ARROW_EXPORT ListViewArray : public Array { const std::shared_ptr& type, int64_t length, const std::vector>& buffers, const std::shared_ptr& values, int64_t null_count = kUnknownNullCount, - int64_t offset = 0); + int64_t offset = 0, const std::shared_ptr& offsets_validity = NULLPTR); /// \brief Construct an ListViewArray /// @@ -73,7 +73,29 @@ class ARROW_EXPORT ListViewArray : public Array { static Result> Make( int64_t length, const std::vector>& buffers, const std::shared_ptr& values, int64_t null_count = kUnknownNullCount, - int64_t offset = 0); + int64_t offset = 0, const std::shared_ptr& offsets_validity = NULLPTR); + + /// \brief Construct an ListViewArray + /// + /// Construct an ListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. + /// + /// \note Each individual array is assumed to be valid by itself, and this + /// function only validates that they can be combined into a valid ListViewArray. + /// + /// \note As this function is not expected to allocate new buffers, the + /// offset of the offsets and sizes arrays is expected to be the same. If that + /// is not the case a Status::Invalid will be returned. + /// + /// \param type An ListViewType instance + /// \param offsets An array of int32 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL listy-view in the array being created. + /// \param values The array that is being nested into the ListViewArray. + static Result> FromArrays( + const std::shared_ptr& type, const std::shared_ptr& offsets, + const std::shared_ptr& sizes, const std::shared_ptr& values); protected: void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c9730283e..586a62a903dfb 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -30,6 +30,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" #include "arrow/array/builder_union.h" +#include "arrow/array/list_view.h" #include "arrow/chunked_array.h" #include "arrow/ipc/json_simple.h" #include "arrow/scalar.h" @@ -986,6 +987,23 @@ Status DictArrayFromJSON(const std::shared_ptr& type, .Value(out); } +Status ListViewArrayFromJSON(const std::shared_ptr& type, + std::string_view offsets_json, std::string_view sizes_json, + std::string_view values_json, std::shared_ptr* out) { + if (type->id() != Type::LIST_VIEW) { + return Status::TypeError("ListViewArrayFromJSON requires list-view type, got ", + *type); + } + const auto& list_view_type = checked_cast(*type); + ARROW_ASSIGN_OR_RAISE(auto offsets, ArrayFromJSON(int32(), offsets_json)); + ARROW_ASSIGN_OR_RAISE(auto sizes, ArrayFromJSON(int32(), sizes_json)); + ARROW_ASSIGN_OR_RAISE(auto values, + ArrayFromJSON(list_view_type.value_type(), values_json)); + return ListViewArray::FromArrays(type, std::move(offsets), std::move(sizes), + std::move(values)) + .Value(out); +} + Status ScalarFromJSON(const std::shared_ptr& type, std::string_view json_string, std::shared_ptr* out) { std::shared_ptr converter; diff --git a/cpp/src/arrow/ipc/json_simple.h b/cpp/src/arrow/ipc/json_simple.h index 3a730ee6a3f19..80783ccaa543c 100644 --- a/cpp/src/arrow/ipc/json_simple.h +++ b/cpp/src/arrow/ipc/json_simple.h @@ -57,6 +57,11 @@ ARROW_EXPORT Status DictArrayFromJSON(const std::shared_ptr&, std::string_view indices_json, std::string_view dictionary_json, std::shared_ptr* out); +ARROW_EXPORT +Status ListViewArrayFromJSON(const std::shared_ptr& type, + std::string_view offsets_json, std::string_view sizes_json, + std::string_view values_json, std::shared_ptr* out); + ARROW_EXPORT Status ScalarFromJSON(const std::shared_ptr&, std::string_view json, std::shared_ptr* out); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 37c430892d022..4b6ae065df54c 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -385,6 +385,16 @@ std::shared_ptr DictArrayFromJSON(const std::shared_ptr& type, return out; } +std::shared_ptr ListViewArrayFromJSON(const std::shared_ptr& type, + std::string_view offsets_json, + std::string_view sizes_json, + std::string_view values_json) { + std::shared_ptr out; + ABORT_NOT_OK(ipc::internal::json::ListViewArrayFromJSON(type, offsets_json, sizes_json, + values_json, &out)); + return out; +} + std::shared_ptr ChunkedArrayFromJSON(const std::shared_ptr& type, const std::vector& json) { std::shared_ptr out; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 270805629529b..8cad87b9f34ff 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -320,6 +320,12 @@ std::shared_ptr DictArrayFromJSON(const std::shared_ptr& type, std::string_view indices_json, std::string_view dictionary_json); +ARROW_TESTING_EXPORT +std::shared_ptr ListViewArrayFromJSON(const std::shared_ptr& type, + std::string_view offsets_json, + std::string_view sizes_json, + std::string_view values_json); + ARROW_TESTING_EXPORT std::shared_ptr RecordBatchFromJSON(const std::shared_ptr&, std::string_view); diff --git a/cpp/src/arrow/util/list_view_util.cc b/cpp/src/arrow/util/list_view_util.cc new file mode 100644 index 0000000000000..d94c60c553d8d --- /dev/null +++ b/cpp/src/arrow/util/list_view_util.cc @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/list_view_util.h" +#include "arrow/util/string.h" + +namespace arrow { +namespace list_view_util { + +namespace internal { + +Status ValidateBuffers(int64_t length, + const std::vector>& buffers, + int64_t offset, uint64_t nested_values_array_length, + const std::shared_ptr& offsets_validity) { + // 1) Check dimensions + if (length < 0) { + return Status::Invalid("length must be non-negative, got ", length); + } + if (offset < 0) { + return Status::Invalid("offset must be non-negative, got ", offset); + } + if (buffers.size() < 3) { + return Status::Invalid("3 buffers needed, only ", buffers.size(), " provided"); + } + const auto expected_buffer_size = + (offset + length) * static_cast(sizeof(int32_t)); + if (buffers[1]->size() < expected_buffer_size) { + return Status::Invalid("offsets buffer too small"); + } + if (buffers[2]->size() < expected_buffer_size) { + return Status::Invalid("sizes buffer too small"); + } + + auto* validity = buffers[0] ? buffers[0]->data() : NULLPTR; + auto* offsets = reinterpret_cast(buffers[1]->data()); + auto* sizes = reinterpret_cast(buffers[2]->data()); + + // 2) Check sizes by themselves + // + // Check that all valid sizes are non-negative and less than or equal to + // nested_values_array_length. This allow us to safely calculate + // nested_values_array_length - sizes[i] for every i on the next checks. + auto IsImpossibleSize = [&](int32_t size) -> bool { + return size < 0 || static_cast(size) > nested_values_array_length; + }; + RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( + validity, offset, length, [&](int64_t position, int64_t size) { + bool block_out_of_bounds = false; + for (int64_t i = 0; i < size; ++i) { + block_out_of_bounds |= IsImpossibleSize(sizes[position + i]); + } + if (ARROW_PREDICT_FALSE(block_out_of_bounds)) { + for (int64_t i = 0; i < size; ++i) { + if (IsImpossibleSize(sizes[position + i])) { + return Status::IndexError( + "list-view size " + arrow::internal::ToChars(sizes[position + i]) + + " at position " + arrow::internal::ToChars(position + i) + + " is out of bounds"); + } + } + } + return Status::OK(); + })); + + // 3) Check offsets combined with sizes + // + // Check that all offsets are non-negative and at a position that allows for + // the corresponding size to be valid. + auto OutOfRangeOffset = [&](int64_t i) -> bool { + // This subtraction is safe because of the previous check. + const uint64_t upper_limit = + nested_values_array_length - static_cast(sizes[i]); + return offsets[i] < 0 || static_cast(offsets[i]) > upper_limit; + }; + RETURN_NOT_OK(arrow::internal::VisitSetBitRuns( + validity, offset, length, [&](int64_t position, int64_t size) { + bool bad_offset = false; + // When a validity bit is 0, the corresponding value is undefined (it + // can be anything). The same is true for offsets[i] when size[i] is 0. + // That said, these undefined values are *usually* 0, so we can perform + // a check with less conditionals if we validate undefined values as if + // they were defined. If they are 0 or another valid value, the check will + // pass and we don't need to perform a check that skips undefined values. + for (int64_t i = 0; i < size; ++i) { + bad_offset |= OutOfRangeOffset(position + i); + } + if (ARROW_PREDICT_FALSE(bad_offset)) { // Full check needed + const auto* offsets_validity_bitmap = offsets_validity->data(); + if (offsets_validity_bitmap) { + for (int64_t i = 0; i < size; ++i) { + if (sizes[position + i] != 0 && + ((offsets_validity_bitmap && + !bit_util::GetBit(offsets_validity_bitmap, position + i)) || + OutOfRangeOffset(position + i))) { + const bool valid = + !offsets_validity_bitmap || + bit_util::GetBit(offsets_validity_bitmap, position + i); + return Status::IndexError( + "list-view offset " + + (valid ? arrow::internal::ToChars(offsets[position + i]) : "NULL") + + " at position " + arrow::internal::ToChars(position + i) + + " is out of bounds"); + } + } + } + } + return Status::OK(); + })); + return Status::OK(); +} + +} // namespace internal + +} // namespace list_view_util +} // namespace arrow diff --git a/cpp/src/arrow/util/list_view_util.h b/cpp/src/arrow/util/list_view_util.h new file mode 100644 index 0000000000000..217327ba1f14c --- /dev/null +++ b/cpp/src/arrow/util/list_view_util.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/data.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace list_view_util { + +/// \brief Get the child array holding the values from an ListView array +inline const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[0]; } + +namespace internal { + +/// \brief Validate the buffers of an ListView array +/// +/// \param length The length of the ListView array +/// \param buffers The buffers of the ListView array being validated +/// (validity bitmap, offsets, and values) +/// \param offset The offset to be applied to all buffers including +/// offsets_validity +/// \param nested_values_array_length The length of the nested values array +/// which offsets refer to +/// \param offsets_validity Optional. The validity bitmap of the offsets buffer. Although +/// not part of the ListView structure, it is used for validation purposes. +Status ValidateBuffers(int64_t length, + const std::vector>& buffers, + int64_t offset, uint64_t nested_values_array_length, + const std::shared_ptr& offsets_validity); + +} // namespace internal + +} // namespace list_view_util +} // namespace arrow