From 06ca3f26dd72c9ef6cdf5710bda572a4864c5cd4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 19 Jul 2023 01:02:13 -0300 Subject: [PATCH] random.cc: Generate random ListViewArrays --- cpp/src/arrow/testing/random.cc | 169 +++++++++++++++++++++++---- cpp/src/arrow/testing/random.h | 17 +++ cpp/src/arrow/testing/random_test.cc | 78 ++++++++++++- 3 files changed, 240 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b74c41f75e452..bde90a8d56d10 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -485,6 +485,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, } namespace { + template std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, typename OffsetArrayType::value_type first_offset, @@ -594,6 +595,121 @@ std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, std::make_shared(), size, buffers, null_count); return std::make_shared(array_data); } + +// Helper for RandomArrayGenerator::ArrayOf: extract some C value from +// a given metadata key. +template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); + } + return output; +} + +/// Try to pass sizes such that every non-null sizes[i] <= values_size. +template +std::shared_ptr ViewOffsetsFromLengthsArray( + SeedType seed, offset_type avg_length, offset_type values_length, + OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) { + using TypeClass = typename OffsetArrayType::TypeClass; + constexpr offset_type kZero = 0; + + auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); + + BufferVector buffers{2}; + buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to + buffers[1] = *AllocateBuffer(sizeof(offset_type) * mutable_sizes_array.length(), + alignment, memory_pool); + auto offsets = reinterpret_cast(buffers[1]->mutable_data()); + + pcg32_fast rng(seed); + std::uniform_int_distribution offset_delta_dist(-avg_length, avg_length); + offset_type offset_base = 0; + for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { + // We want to always sample the offset_delta_dist(rng) to make sure + // different options regarding nulls and empty views don't affect + // the other offsets. + offset_type offset = offset_base + offset_delta_dist(rng); + if (mutable_sizes_array.IsNull(i)) { + if (force_empty_nulls) { + sizes[i] = 0; + } + offsets[i] = zero_undefined_offsets ? 0 : offset; + continue; + } + offset_type size = sizes[i]; + if (size == 0) { + offsets[i] = zero_undefined_offsets ? 0 : offset; + } else { + // Ensure that the size is not too large. + if (ARROW_PREDICT_FALSE(size > values_length)) { + size = values_length; + sizes[i] = size; // Fix the size. + } + // Ensure the offset is not negative or too large. + offset = std::max(offset, kZero); + if (offset > values_length - size) { + offset = values_length - size; + } + offsets[i] = offset; + } + offset_base += avg_length; + } + + auto array_data = + ArrayData::Make(TypeTraits::type_singleton(), + mutable_sizes_array.length(), std::move(buffers), /*null_count=*/0); + return std::make_shared(std::move(array_data)); +} + +template +Result> ArrayOfListView(RAG& self, const Field& field, + int64_t length, int64_t alignment, + MemoryPool* memory_pool, + double null_probability) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename ArrayType::offset_type; + using OffsetArrayType = typename CTypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto zero_undefined_offsets = + GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); + const auto lengths = internal::checked_pointer_cast( + self.RAG::template Numeric( + length, min_length, max_length, null_probability)); + + // List views don't have to be disjoint, so let's make the values_length a + // multiple of the average list-view size. To make sure every list view + // into the values array can fit, it should be at least max_length. + const offset_type avg_length = min_length + (max_length - min_length) / 2; + const int64_t values_length = std::max(avg_length * (length - lengths->null_count()), + static_cast(max_length)); + DCHECK_LT(values_length, std::numeric_limits::max()); + const auto values = self.RAG::ArrayOf( + *internal::checked_pointer_cast(field.type())->value_field(), + values_length, alignment, memory_pool); + + const auto offsets = ViewOffsetsFromLengthsArray( + self.seed(), avg_length, static_cast(values_length), *lengths, + force_empty_nulls, zero_undefined_offsets, alignment, memory_pool); + + return ArrayType::FromArrays(field.type(), *offsets, *lengths, *values); +} + } // namespace std::shared_ptr RandomArrayGenerator::Offsets( @@ -623,6 +739,30 @@ std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t s return *::arrow::ListArray::FromArrays(*offsets, values); } +std::shared_ptr RandomArrayGenerator::ListView( + const Array& values, int64_t size, double null_probability, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) { + using offset_type = int32_t; + using OffsetArrayType = Int32Array; + using OffsetArrowType = Int32Type; + + DCHECK_LE(values.length(), std::numeric_limits::max()); + const auto values_length = static_cast(values.length()); + + const offset_type avg_length = (values_length - 1) / size + 1; + const offset_type min_length = 0; + const offset_type max_length = std::min(std::max(2 * avg_length, 1), values_length); + const auto lengths = internal::checked_pointer_cast( + Numeric(size, min_length, max_length, + null_probability)); + + const auto offsets = ViewOffsetsFromLengthsArray( + seed(), avg_length, values_length, *lengths, force_empty_nulls, + zero_undefined_offsets, alignment, memory_pool); + + return *ListViewArray::FromArrays(*offsets, *lengths, values, memory_pool); +} + std::shared_ptr RandomArrayGenerator::Map(const std::shared_ptr& keys, const std::shared_ptr& items, int64_t size, double null_probability, @@ -699,27 +839,6 @@ std::shared_ptr RandomArrayGenerator::DenseUnion(const ArrayVector& field return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes); } -namespace { - -// Helper for RandomArrayGenerator::ArrayOf: extract some C value from -// a given metadata key. -template ::ArrowType> -enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, - const std::string& key, - T default_value) { - if (!metadata) return default_value; - const auto index = metadata->FindKey(key); - if (index < 0) return default_value; - const auto& value = metadata->value(index); - T output{}; - if (!internal::ParseValue(value.data(), value.length(), &output)) { - ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); - } - return output; -} - -} // namespace - std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, int64_t size, double null_probability, @@ -797,6 +916,12 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values); \ } +#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + return *ArrayOfListView(*this, field, length, alignment, memory_pool, \ + null_probability); \ + } + const double null_probability = field.nullable() ? GetMetadata(field.metadata().get(), "null_probability", 0.01) @@ -914,6 +1039,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(ListArray); + GENERATE_LIST_VIEW_CASE(ListViewArray); case Type::type::STRUCT: { ArrayVector child_arrays(field.type()->num_fields()); @@ -1037,6 +1163,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(LargeListArray); + GENERATE_LIST_VIEW_CASE(LargeListViewArray); default: break; diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index de9ea6d05648d..9246950ac8043 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -438,6 +438,23 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random ListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr ListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, + bool zero_undefined_offsets = false, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random MapArray /// /// \param[in] keys The underlying keys array diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index f269818e83a3d..489b0532c3f9c 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -70,7 +70,7 @@ class RandomArrayTest : public ::testing::TestWithParam { } bool HasList(const DataType& type) { - if (is_var_length_list(type.id())) { + if (is_var_length_list_like(type.id())) { return true; } for (const auto& child : type.fields()) { @@ -99,7 +99,7 @@ TEST_P(RandomArrayTest, GenerateArrayAlignment) { const int64_t alignment = 1024; auto field = GetField(); if (HasList(*field->type())) { - GTEST_SKIP() << "ListArray::FromArrays does not conserve buffer alignment"; + GTEST_SKIP() << "List[View]Array::FromArrays does not conserve buffer alignment"; } auto array = GenerateArray(*field, /*size=*/13, 0xDEADBEEF, alignment); AssertTypeEqual(field->type(), array->type()); @@ -176,6 +176,13 @@ auto values = ::testing::Values( key_value_metadata({{"force_empty_nulls", "true"}})), field("listint81024values", list(int8()), true, key_value_metadata({{"values", "1024"}})), + field("listviewint8", list_view(int8())), + field("listviewlistviewint8", list_view(list_view(int8()))), + field("listviewint8emptynulls", list_view(int8()), true, + key_value_metadata( + {{"force_empty_nulls", "true"}, {"zero_undefined_offsets", "true"}})), + field("listviewint81024values", list_view(int8()), true, + key_value_metadata({{"values", "1024"}})), field("structints", struct_({ field("int8", int8()), field("int16", int16()), @@ -200,7 +207,8 @@ auto values = ::testing::Values( field("fixedsizelist", fixed_size_list(int8(), 4)), field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), field("largebinary", large_binary()), - field("largelistlistint8", large_list(list(int8())))); + field("largelistlistint8", large_list(list(int8()))), + field("largelistviewlistviewint8", large_list_view(list_view(int8())))); INSTANTIATE_TEST_SUITE_P( TestRandomArrayGeneration, RandomArrayTest, values, @@ -399,6 +407,39 @@ TEST(TypeSpecificTests, ListLengths) { } } +TEST(TypeSpecificTests, ListViewLengths) { + { + auto field = + arrow::field("list_view", list_view(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), kExpectedLength); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = + arrow::field("list_view", large_list_view(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_EQ(array->length(), kExpectedLength); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } +} + TEST(TypeSpecificTests, MapValues) { auto field = arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); @@ -499,6 +540,24 @@ TEST(RandomList, Basics) { } } +TEST(RandomListView, Basics) { + random::RandomArrayGenerator rng(42); + for (const double null_probability : {0.0, 0.1, 0.98}) { + SCOPED_TRACE("null_probability = " + std::to_string(null_probability)); + auto values = rng.Int16(1234, 0, 10000, null_probability); + auto array = rng.ListView(*values, 45, null_probability); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), 45); + const auto& list_view_array = checked_cast(*array); + ASSERT_EQ(list_view_array.values()->length(), 1234); + int64_t null_count = 0; + for (int64_t i = 0; i < array->length(); ++i) { + null_count += array->IsNull(i); + } + ASSERT_EQ(null_count, array->data()->null_count); + } +} + TEST(RandomChildFieldNullablity, List) { random::RandomArrayGenerator rng(42); @@ -512,6 +571,19 @@ TEST(RandomChildFieldNullablity, List) { ARROW_EXPECT_OK(batch->ValidateFull()); } +TEST(RandomChildFieldNullablity, ListView) { + random::RandomArrayGenerator rng(42); + + auto item = arrow::field("item", arrow::int8(), true); + auto nest_list_view_field = arrow::field("list_view", list_view(item), false); + auto list_view_field = arrow::field("list_view", list_view(nest_list_view_field), true); + auto array = rng.ArrayOf(*list_view_field, 428); + ARROW_EXPECT_OK(array->ValidateFull()); + + auto batch = rng.BatchOf({list_view_field}, 428); + ARROW_EXPECT_OK(batch->ValidateFull()); +} + TEST(RandomChildFieldNullablity, Struct) { random::RandomArrayGenerator rng(42);