Skip to content

Commit

Permalink
ListViewArray: Buffers validation, creation from JSON, and basic tests
Browse files Browse the repository at this point in the history
  • Loading branch information
felipecrv committed Apr 26, 2023
1 parent 6ae5d3b commit 8feebe4
Show file tree
Hide file tree
Showing 10 changed files with 399 additions and 6 deletions.
2 changes: 2 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ set(ARROW_SRCS
util/future.cc
util/int_util.cc
util/io_util.cc
util/list_view_util.cc
util/logging.cc
util/key_value_metadata.cc
util/memory.cc
Expand Down Expand Up @@ -769,6 +770,7 @@ add_arrow_test(array_test
array/array_binary_test.cc
array/array_dict_test.cc
array/array_list_test.cc
array/array_list_view_test.cc
array/array_run_end_test.cc
array/array_struct_test.cc
array/array_union_test.cc
Expand Down
104 changes: 104 additions & 0 deletions cpp/src/arrow/array/array_list_view_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include "arrow/array/list_view.h"
#include "arrow/array/util.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"

namespace arrow {

using internal::checked_cast;

// ----------------------------------------------------------------------
// List-view array tests

namespace {

class TestListViewArray : public ::testing::Test {
public:
std::shared_ptr<Array> string_values;
std::shared_ptr<Array> int32_values;
std::shared_ptr<Array> int16_values;

void SetUp() override {
string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])");
int32_values = ArrayFromJSON(int32(), "[1, 20, 3]");
int16_values = ArrayFromJSON(int16(), "[10, 2, 30]");
}

static std::shared_ptr<Array> Offsets(std::string_view json) {
return ArrayFromJSON(int32(), json);
}

static std::shared_ptr<Array> Sizes(std::string_view json) {
return ArrayFromJSON(int32(), json);
}
};

} // namespace

// Rebuilding an array from a ListViewArray's ArrayData through MakeArray() must
// give an equal array that shares the same ArrayData and whose dynamic type is
// ListViewArray.
TEST_F(TestListViewArray, MakeArray) {
  const auto offsets = Offsets("[0, 0, 1, 2]");
  const auto sizes = Sizes("[2, 1, 1, 1]");
  ASSERT_OK_AND_ASSIGN(
      auto expected,
      ListViewArray::FromArrays(list_view(utf8()), offsets, sizes, string_values));

  const auto data = expected->data();
  const auto rebuilt = MakeArray(data);
  ASSERT_ARRAYS_EQUAL(*rebuilt, *expected);
  // Should be the exact same ArrayData object
  ASSERT_EQ(rebuilt->data(), data);
  ASSERT_NE(std::dynamic_pointer_cast<ListViewArray>(rebuilt), NULLPTR);
}

// Exercises ListViewArray::FromArrays: a zero-offset input, a sliced
// (non-zero offset) input, and the two rejection paths checked below.
TEST_F(TestListViewArray, FromOffsetsAndSizes) {
  const auto int32_list_view_type = list_view(int32());
  std::shared_ptr<ListViewArray> result;

  // The last view has a null size; construction is expected to succeed even
  // though its offset (1000) lies past the three available values.
  ASSERT_OK_AND_ASSIGN(
      result,
      ListViewArray::FromArrays(int32_list_view_type, Offsets("[0, 0, 1, 1000]"),
                                Sizes("[2, 1, 1, null]"), int32_values));
  ASSERT_EQ(result->length(), 4);
  ASSERT_ARRAYS_EQUAL(*result->values(), *int32_values);
  ASSERT_EQ(result->offset(), 0);
  ASSERT_EQ(result->data()->GetNullCount(), 1);
  ASSERT_EQ(result->data()->buffers.size(), 3);

  // Passing a non-zero logical offset
  const auto sliced_offsets = Offsets("[0, 0, 1, 2]")->Slice(1);
  const auto sliced_sizes = Sizes("[2, 1, 1, 1]")->Slice(1);
  ASSERT_OK_AND_ASSIGN(result,
                       ListViewArray::FromArrays(list_view(utf8()), sliced_offsets,
                                                 sliced_sizes, string_values));
  ASSERT_EQ(result->length(), 3);
  ASSERT_ARRAYS_EQUAL(*result->values(), *string_values);
  ASSERT_EQ(result->data()->GetNullCount(), 0);
  ASSERT_EQ(result->offset(), 1);

  // The values array must have the list-view's value type.
  ASSERT_RAISES_WITH_MESSAGE(
      Invalid, "Invalid: values type must match int32",
      ListViewArray::FromArrays(int32_list_view_type, Offsets("[0, 0, 1, 1000]"),
                                Sizes("[2, 1, 1, null]"), int16_values));
  // Offsets and sizes must describe the same number of views.
  ASSERT_RAISES_WITH_MESSAGE(
      Invalid, "Invalid: offsets and sizes must have the same length",
      ListViewArray::FromArrays(int32_list_view_type, Offsets("[0, 0, 1]"),
                                Sizes("[2, 1, 1, null]"), int32_values));
}

} // namespace arrow
45 changes: 41 additions & 4 deletions cpp/src/arrow/array/list_view.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "arrow/array/list_view.h"
#include "arrow/array/util.h"
#include "arrow/util/list_view_util.h"
#include "arrow/util/logging.h"

namespace arrow {
Expand All @@ -39,20 +40,24 @@ ListViewArray::ListViewArray(const std::shared_ptr<DataType>& type, int64_t leng
// Validating factory for a ListViewArray with an explicit `type`.
//
// Returns Status::Invalid when `type` is not a LIST_VIEW type, propagates any
// error from list_view_util::internal::ValidateBuffers(), and otherwise
// constructs the array from the given buffers and values.
//
// NOTE(review): this hunk shows both the previous tail of the parameter list
// (ending at `offset`) and the new one (ending at `offsets_validity`).
// NOTE(review): `values` is dereferenced without a null check -- confirm that
// callers never pass a null values array.
Result<std::shared_ptr<ListViewArray>> ListViewArray::Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::shared_ptr<Array>& values, int64_t null_count, int64_t offset) {
const std::shared_ptr<Array>& values, int64_t null_count, int64_t offset,
const std::shared_ptr<Buffer>& offsets_validity) {
if (type->id() != Type::LIST_VIEW) {
return Status::Invalid("Type must be LIST_VIEW");
}
// TODO(felipecrv): validate buffers and values array
// The buffers are checked against the length of the values array; the
// optional validity bitmap of the offsets is forwarded to the validator.
RETURN_NOT_OK(list_view_util::internal::ValidateBuffers(
length, buffers, offset, values->length(), offsets_validity));
return std::make_shared<ListViewArray>(type, length, buffers, values, null_count,
offset);
}

// Convenience overload: derives the list-view type from `values->type()` and
// forwards every argument to the overload that takes an explicit type.
//
// NOTE(review): this hunk shows both the previous and the new versions of the
// parameter-list tail and of the forwarding call (the new one also passes
// `offsets_validity`).
Result<std::shared_ptr<ListViewArray>> ListViewArray::Make(
int64_t length, const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::shared_ptr<Array>& values, int64_t null_count, int64_t offset) {
const std::shared_ptr<Array>& values, int64_t null_count, int64_t offset,
const std::shared_ptr<Buffer>& offsets_validity) {
auto list_view_type = list_view(values->type());
return Make(list_view_type, length, buffers, values, null_count, offset);
return Make(list_view_type, length, buffers, values, null_count, offset,
offsets_validity);
}

void ListViewArray::SetData(const std::shared_ptr<ArrayData>& data) {
Expand All @@ -66,4 +71,36 @@ void ListViewArray::SetData(const std::shared_ptr<ArrayData>& data) {
values_array_ = MakeArray(this->data()->child_data[0]);
}

// Build a ListViewArray that reuses the buffers of `offsets` and `sizes`.
//
// No buffer is copied. The resulting array takes its validity bitmap, length,
// null count and logical offset from `sizes`, and its offsets/sizes data
// buffers from the respective input arrays. The validity bitmap of `offsets`
// is not stored; it is only handed to Make() for validation.
//
// Returns Status::Invalid when:
//   - `type` is not a list-view type,
//   - `offsets` or `sizes` is not an int32 array,
//   - the type of `values` differs from the value type of `type`,
//   - `offsets` and `sizes` differ in length or in logical offset,
// and propagates any error reported by Make().
Result<std::shared_ptr<ListViewArray>> ListViewArray::FromArrays(
    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& offsets,
    const std::shared_ptr<Array>& sizes, const std::shared_ptr<Array>& values) {
  if (type->id() != Type::LIST_VIEW) {
    return Status::Invalid("Expected a list-view type");
  }
  if (offsets->type()->id() != Type::INT32) {
    return Status::Invalid("offsets must be int32");
  }
  if (sizes->type()->id() != Type::INT32) {
    return Status::Invalid("sizes must be int32");
  }
  const auto* list_view_type = internal::checked_cast<const ListViewType*>(type.get());
  if (!list_view_type->value_type()->Equals(*values->type())) {
    return Status::Invalid("values type must match ",
                           list_view_type->value_type()->ToString());
  }
  if (offsets->length() != sizes->length()) {
    return Status::Invalid("offsets and sizes must have the same length");
  }
  // The buffers are shared as-is, so a single logical offset has to be valid
  // for both the offsets and the sizes buffers.
  if (offsets->offset() != sizes->offset()) {
    return Status::Invalid("offsets and sizes must have the same offset");
  }
  // Buffer layout: [validity (from sizes), offsets data, sizes data].
  const std::vector<std::shared_ptr<Buffer>> buffers = {
      sizes->data()->buffers[0],
      offsets->data()->buffers[1],
      sizes->data()->buffers[1],
  };
  return Make(type, sizes->length(), buffers, values, sizes->null_count(),
              sizes->offset(), /*offsets_validity=*/offsets->data()->buffers[0]);
}

} // namespace arrow
26 changes: 24 additions & 2 deletions cpp/src/arrow/array/list_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class ARROW_EXPORT ListViewArray : public Array {
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::shared_ptr<Array>& values, int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
int64_t offset = 0, const std::shared_ptr<Buffer>& offsets_validity = NULLPTR);

/// \brief Construct a ListViewArray
///
Expand All @@ -73,7 +73,29 @@ class ARROW_EXPORT ListViewArray : public Array {
static Result<std::shared_ptr<ListViewArray>> Make(
int64_t length, const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::shared_ptr<Array>& values, int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
int64_t offset = 0, const std::shared_ptr<Buffer>& offsets_validity = NULLPTR);

/// \brief Construct a ListViewArray
///
/// Construct a ListViewArray using buffers from offsets and sizes arrays
/// that project views into the values array.
///
/// \note Each individual array is assumed to be valid by itself, and this
/// function only validates that they can be combined into a valid ListViewArray.
///
/// \note As this function is not expected to allocate new buffers, the
/// offset of the offsets and sizes arrays is expected to be the same. If that
/// is not the case a Status::Invalid will be returned.
///
/// \param type A ListViewType instance
/// \param offsets An array of int32 offsets into the values array. NULL values are
/// supported if the corresponding value in sizes is NULL or 0.
/// \param sizes An array containing the int32 sizes of every view. NULL values are
/// taken to represent a NULL list-view in the array being created.
/// \param values The array that is being nested into the ListViewArray.
/// \return The new ListViewArray, or Status::Invalid if the arrays cannot be
/// combined into a valid ListViewArray.
static Result<std::shared_ptr<ListViewArray>> FromArrays(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& sizes, const std::shared_ptr<Array>& values);

protected:
void SetData(const std::shared_ptr<ArrayData>& data);
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/arrow/ipc/json_simple.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "arrow/array/builder_primitive.h"
#include "arrow/array/builder_time.h"
#include "arrow/array/builder_union.h"
#include "arrow/array/list_view.h"
#include "arrow/chunked_array.h"
#include "arrow/ipc/json_simple.h"
#include "arrow/scalar.h"
Expand Down Expand Up @@ -986,6 +987,23 @@ Status DictArrayFromJSON(const std::shared_ptr<DataType>& type,
.Value(out);
}

// Build a list-view array out of three JSON documents: the int32 offsets, the
// int32 sizes, and the values (parsed with the value type of `type`).
//
// Returns a TypeError if `type` is not a list-view type, and forwards any
// error raised while parsing the JSON inputs or assembling the array. `out`
// is only written on success.
Status ListViewArrayFromJSON(const std::shared_ptr<DataType>& type,
                             std::string_view offsets_json, std::string_view sizes_json,
                             std::string_view values_json, std::shared_ptr<Array>* out) {
  const bool is_list_view = type->id() == Type::LIST_VIEW;
  if (!is_list_view) {
    return Status::TypeError("ListViewArrayFromJSON requires list-view type, got ",
                             *type);
  }
  const auto& concrete_type = checked_cast<const ListViewType&>(*type);
  ARROW_ASSIGN_OR_RAISE(auto offsets_array, ArrayFromJSON(int32(), offsets_json));
  ARROW_ASSIGN_OR_RAISE(auto sizes_array, ArrayFromJSON(int32(), sizes_json));
  ARROW_ASSIGN_OR_RAISE(auto values_array,
                        ArrayFromJSON(concrete_type.value_type(), values_json));
  ARROW_ASSIGN_OR_RAISE(
      *out, ListViewArray::FromArrays(type, offsets_array, sizes_array, values_array));
  return Status::OK();
}

Status ScalarFromJSON(const std::shared_ptr<DataType>& type, std::string_view json_string,
std::shared_ptr<Scalar>* out) {
std::shared_ptr<Converter> converter;
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/ipc/json_simple.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ ARROW_EXPORT
Status DictArrayFromJSON(const std::shared_ptr<DataType>&, std::string_view indices_json,
std::string_view dictionary_json, std::shared_ptr<Array>* out);

/// \brief Build a list-view array from JSON texts for its offsets, sizes and values
///
/// The offsets and sizes are parsed as int32 arrays and the values are parsed
/// with the value type of `type`. A TypeError is returned if `type` is not a
/// list-view type.
ARROW_EXPORT
Status ListViewArrayFromJSON(const std::shared_ptr<DataType>& type,
std::string_view offsets_json, std::string_view sizes_json,
std::string_view values_json, std::shared_ptr<Array>* out);

ARROW_EXPORT
Status ScalarFromJSON(const std::shared_ptr<DataType>&, std::string_view json,
std::shared_ptr<Scalar>* out);
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/testing/gtest_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,16 @@ std::shared_ptr<Array> DictArrayFromJSON(const std::shared_ptr<DataType>& type,
return out;
}

std::shared_ptr<Array> ListViewArrayFromJSON(const std::shared_ptr<DataType>& type,
std::string_view offsets_json,
std::string_view sizes_json,
std::string_view values_json) {
std::shared_ptr<Array> out;
ABORT_NOT_OK(ipc::internal::json::ListViewArrayFromJSON(type, offsets_json, sizes_json,
values_json, &out));
return out;
}

std::shared_ptr<ChunkedArray> ChunkedArrayFromJSON(const std::shared_ptr<DataType>& type,
const std::vector<std::string>& json) {
std::shared_ptr<ChunkedArray> out;
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/testing/gtest_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,12 @@ std::shared_ptr<Array> DictArrayFromJSON(const std::shared_ptr<DataType>& type,
std::string_view indices_json,
std::string_view dictionary_json);

/// \brief Build a list-view array from JSON texts for its offsets, sizes and values
///
/// Testing helper: failures are checked with ABORT_NOT_OK instead of being
/// returned to the caller.
ARROW_TESTING_EXPORT
std::shared_ptr<Array> ListViewArrayFromJSON(const std::shared_ptr<DataType>& type,
std::string_view offsets_json,
std::string_view sizes_json,
std::string_view values_json);

ARROW_TESTING_EXPORT
std::shared_ptr<RecordBatch> RecordBatchFromJSON(const std::shared_ptr<Schema>&,
std::string_view);
Expand Down
Loading

0 comments on commit 8feebe4

Please sign in to comment.