Skip to content

Commit

Permalink
random.cc: Generate random ListViewArrays
Browse files Browse the repository at this point in the history
  • Loading branch information
felipecrv committed Jul 20, 2023
1 parent 3afc1ff commit 06ca3f2
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 24 deletions.
169 changes: 148 additions & 21 deletions cpp/src/arrow/testing/random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ std::shared_ptr<Array> RandomArrayGenerator::FixedSizeBinary(int64_t size,
}

namespace {

template <typename OffsetArrayType>
std::shared_ptr<Array> GenerateOffsets(SeedType seed, int64_t size,
typename OffsetArrayType::value_type first_offset,
Expand Down Expand Up @@ -594,6 +595,121 @@ std::shared_ptr<Array> OffsetsFromLengthsArray(OffsetArrayType* lengths,
std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers, null_count);
return std::make_shared<OffsetArrayType>(array_data);
}

// Helper for RandomArrayGenerator::ArrayOf: looks up `key` in `metadata` and
// parses the stored string as a C value of type T (via ArrowType).
//
// `default_value` is returned when `metadata` is null or does not contain
// `key`. A value that is present but cannot be parsed is reported through
// ABORT_NOT_OK with an Invalid status.
template <typename T, typename ArrowType = typename CTypeTraits<T>::ArrowType>
enable_if_parameter_free<ArrowType, T> GetMetadata(const KeyValueMetadata* metadata,
                                                   const std::string& key,
                                                   T default_value) {
  // No metadata attached at all: nothing to look up.
  if (metadata == nullptr) {
    return default_value;
  }
  // FindKey yields a negative index when the key is absent.
  const auto key_index = metadata->FindKey(key);
  if (key_index < 0) {
    return default_value;
  }
  const auto& raw_value = metadata->value(key_index);
  T parsed{};
  if (!internal::ParseValue<ArrowType>(raw_value.data(), raw_value.length(), &parsed)) {
    ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", raw_value));
  }
  return parsed;
}

/// Build the offsets array of a list-view array from an array of view sizes.
///
/// One offset is produced per entry of mutable_sizes_array. Offsets are
/// sampled as `offset_base + delta`, with delta drawn uniformly from
/// [-avg_length, avg_length] and offset_base growing by avg_length after
/// every non-null entry, so the generated views may overlap.
///
/// Try to pass sizes such that every non-null sizes[i] <= values_length.
/// Non-null sizes larger than values_length are clamped in place.
///
/// \param[in] seed seed of the random generator used to sample the offsets
/// \param[in] avg_length average view size; bounds the random offset delta
/// \param[in] values_length length of the values array the views point into
/// \param[in,out] mutable_sizes_array the view sizes; its values buffer may
///   be modified (clamped sizes, zeroed nulls)
/// \param[in] force_empty_nulls if true, the size of every null entry is set to 0
/// \param[in] zero_undefined_offsets if true, the offset of every null entry
///   and of every empty (size 0) view is set to 0; otherwise the sampled
///   offset is stored as-is, without being clamped to the values range
/// \param[in] alignment alignment of the allocated offsets buffer (in bytes)
/// \param[in] memory_pool memory pool to allocate the offsets buffer from
/// \return an offsets array without a validity buffer (null_count == 0)
template <typename OffsetArrayType, typename offset_type>
std::shared_ptr<Array> ViewOffsetsFromLengthsArray(
SeedType seed, offset_type avg_length, offset_type values_length,
OffsetArrayType& mutable_sizes_array, bool force_empty_nulls,
bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) {
using TypeClass = typename OffsetArrayType::TypeClass;
constexpr offset_type kZero = 0;

// Buffer 1 of the sizes array holds the size values; they are edited in place.
auto* sizes = mutable_sizes_array.data()->template GetMutableValues<offset_type>(1);

BufferVector buffers{2};
buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to
buffers[1] = *AllocateBuffer(sizeof(offset_type) * mutable_sizes_array.length(),
alignment, memory_pool);
auto offsets = reinterpret_cast<offset_type*>(buffers[1]->mutable_data());

pcg32_fast rng(seed);
std::uniform_int_distribution<offset_type> offset_delta_dist(-avg_length, avg_length);
offset_type offset_base = 0;
for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) {
// We want to always sample the offset_delta_dist(rng) to make sure
// different options regarding nulls and empty views don't affect
// the other offsets.
offset_type offset = offset_base + offset_delta_dist(rng);
if (mutable_sizes_array.IsNull(i)) {
if (force_empty_nulls) {
sizes[i] = 0;
}
offsets[i] = zero_undefined_offsets ? 0 : offset;
// Note: skipping the rest of the body also skips the offset_base
// increment, so null entries do not advance the base offset.
continue;
}
offset_type size = sizes[i];
if (size == 0) {
// Empty view: the offset is not clamped to the values range.
offsets[i] = zero_undefined_offsets ? 0 : offset;
} else {
// Ensure that the size is not too large.
if (ARROW_PREDICT_FALSE(size > values_length)) {
size = values_length;
sizes[i] = size; // Fix the size.
}
// Ensure the offset is not negative or too large.
offset = std::max(offset, kZero);
if (offset > values_length - size) {
offset = values_length - size;
}
offsets[i] = offset;
}
offset_base += avg_length;
}

auto array_data =
ArrayData::Make(TypeTraits<TypeClass>::type_singleton(),
mutable_sizes_array.length(), std::move(buffers), /*null_count=*/0);
return std::make_shared<OffsetArrayType>(std::move(array_data));
}

/// Generate a random list-view array of type ArrayType with `length` entries
/// for `field`, using the generator `self`.
///
/// Generation options are read from the field metadata:
///  - "min_length" (default 0) and "max_length" (default 20) are passed as
///    the bounds of the randomly generated view sizes;
///  - "force_empty_nulls" and "zero_undefined_offsets" (both default false)
///    are forwarded to ViewOffsetsFromLengthsArray.
///
/// The sizes are generated first, then the child values, then the offsets.
template <typename ArrayType, typename RAG>
Result<std::shared_ptr<Array>> ArrayOfListView(RAG& self, const Field& field,
                                               int64_t length, int64_t alignment,
                                               MemoryPool* memory_pool,
                                               double null_probability) {
  using offset_type = typename ArrayType::offset_type;
  using TypeClass = typename ArrayType::TypeClass;
  using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
  using OffsetArrayType = typename CTypeTraits<offset_type>::ArrayType;

  const auto* metadata = field.metadata().get();
  const auto min_length = GetMetadata<offset_type>(metadata, "min_length", 0);
  const auto max_length = GetMetadata<offset_type>(metadata, "max_length", 20);
  const auto force_empty_nulls = GetMetadata<bool>(metadata, "force_empty_nulls", false);
  const auto zero_undefined_offsets =
      GetMetadata<bool>(metadata, "zero_undefined_offsets", false);

  const auto sizes = internal::checked_pointer_cast<OffsetArrayType>(
      self.RAG::template Numeric<OffsetArrowType, offset_type>(
          length, min_length, max_length, null_probability));

  // List views don't have to be disjoint, so the values array only needs to
  // be a multiple of the average list-view size. It must still be at least
  // max_length long so that every single view can fit into it.
  const offset_type avg_length = min_length + (max_length - min_length) / 2;
  const int64_t non_null_count = length - sizes->null_count();
  const int64_t values_length =
      std::max(avg_length * non_null_count, static_cast<int64_t>(max_length));
  DCHECK_LT(values_length, std::numeric_limits<offset_type>::max());

  const auto list_view_type = internal::checked_pointer_cast<TypeClass>(field.type());
  const auto values = self.RAG::ArrayOf(*list_view_type->value_field(), values_length,
                                        alignment, memory_pool);

  const auto offsets = ViewOffsetsFromLengthsArray<OffsetArrayType, offset_type>(
      self.seed(), avg_length, static_cast<offset_type>(values_length), *sizes,
      force_empty_nulls, zero_undefined_offsets, alignment, memory_pool);

  return ArrayType::FromArrays(field.type(), *offsets, *sizes, *values);
}

} // namespace

std::shared_ptr<Array> RandomArrayGenerator::Offsets(
Expand Down Expand Up @@ -623,6 +739,30 @@ std::shared_ptr<Array> RandomArrayGenerator::List(const Array& values, int64_t s
return *::arrow::ListArray::FromArrays(*offsets, values);
}

// Generate a random ListViewArray with `size` list-views over `values`.
//
// The view sizes are generated with Numeric() using the bounds
// [0, max_length], where max_length is twice the average view size
// ceil(values.length() / size), capped at values.length(). The offsets are
// then derived from those sizes by ViewOffsetsFromLengthsArray, which also
// applies `force_empty_nulls` and `zero_undefined_offsets`.
//
// `values` must not have more than INT32_MAX elements (checked in debug
// builds only). A `size` of 0 is handled without dividing by zero.
std::shared_ptr<Array> RandomArrayGenerator::ListView(
    const Array& values, int64_t size, double null_probability, bool force_empty_nulls,
    bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) {
  using offset_type = int32_t;
  using OffsetArrayType = Int32Array;
  using OffsetArrowType = Int32Type;

  DCHECK_LE(values.length(), std::numeric_limits<offset_type>::max());
  const auto values_length = static_cast<offset_type>(values.length());

  // Average view size. The division is guarded so that an empty request
  // (size == 0) does not divide by zero; in that case no size or offset is
  // generated and the average is irrelevant.
  const offset_type avg_length =
      size > 0 ? static_cast<offset_type>((values_length - 1) / size + 1) : 0;
  const offset_type min_length = 0;
  // Compute 2 * avg_length in 64 bits: avg_length can be as large as
  // values_length (when size == 1), and doubling it in 32 bits could overflow.
  // The result is capped at values_length, so it always fits in offset_type.
  const int64_t doubled_avg_length = 2 * static_cast<int64_t>(avg_length);
  const auto max_length = static_cast<offset_type>(std::min<int64_t>(
      std::max<int64_t>(doubled_avg_length, 1), values_length));
  const auto lengths = internal::checked_pointer_cast<OffsetArrayType>(
      Numeric<OffsetArrowType, offset_type>(size, min_length, max_length,
                                            null_probability));

  const auto offsets = ViewOffsetsFromLengthsArray<OffsetArrayType, offset_type>(
      seed(), avg_length, values_length, *lengths, force_empty_nulls,
      zero_undefined_offsets, alignment, memory_pool);

  return *ListViewArray::FromArrays(*offsets, *lengths, values, memory_pool);
}

std::shared_ptr<Array> RandomArrayGenerator::Map(const std::shared_ptr<Array>& keys,
const std::shared_ptr<Array>& items,
int64_t size, double null_probability,
Expand Down Expand Up @@ -699,27 +839,6 @@ std::shared_ptr<Array> RandomArrayGenerator::DenseUnion(const ArrayVector& field
return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes);
}

namespace {

// Helper for RandomArrayGenerator::ArrayOf: looks up `key` in `metadata` and
// parses the stored string as a C value of type T (via ArrowType).
//
// `default_value` is returned when `metadata` is null or does not contain
// `key`. A value that is present but cannot be parsed is reported through
// ABORT_NOT_OK with an Invalid status.
template <typename T, typename ArrowType = typename CTypeTraits<T>::ArrowType>
enable_if_parameter_free<ArrowType, T> GetMetadata(const KeyValueMetadata* metadata,
                                                   const std::string& key,
                                                   T default_value) {
  // No metadata attached at all: nothing to look up.
  if (metadata == nullptr) {
    return default_value;
  }
  // FindKey yields a negative index when the key is absent.
  const auto key_index = metadata->FindKey(key);
  if (key_index < 0) {
    return default_value;
  }
  const auto& raw_value = metadata->value(key_index);
  T parsed{};
  if (!internal::ParseValue<ArrowType>(raw_value.data(), raw_value.length(), &parsed)) {
    ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", raw_value));
  }
  return parsed;
}

} // namespace

std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(std::shared_ptr<DataType> type,
int64_t size,
double null_probability,
Expand Down Expand Up @@ -797,6 +916,12 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values); \
}

// Expands to the switch case for the list-view array type ARRAY_TYPE: the
// array is generated by ArrayOfListView<ARRAY_TYPE> and the returned Result
// is dereferenced to obtain the array. Relies on `field`, `length`,
// `alignment`, `memory_pool` and `null_probability` being in scope at the
// point of expansion.
#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE) \
case ARRAY_TYPE::TypeClass::type_id: { \
return *ArrayOfListView<ARRAY_TYPE>(*this, field, length, alignment, memory_pool, \
null_probability); \
}

const double null_probability =
field.nullable()
? GetMetadata<double>(field.metadata().get(), "null_probability", 0.01)
Expand Down Expand Up @@ -914,6 +1039,7 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
}

GENERATE_LIST_CASE(ListArray);
GENERATE_LIST_VIEW_CASE(ListViewArray);

case Type::type::STRUCT: {
ArrayVector child_arrays(field.type()->num_fields());
Expand Down Expand Up @@ -1037,6 +1163,7 @@ std::shared_ptr<Array> RandomArrayGenerator::ArrayOf(const Field& field, int64_t
}

GENERATE_LIST_CASE(LargeListArray);
GENERATE_LIST_VIEW_CASE(LargeListViewArray);

default:
break;
Expand Down
17 changes: 17 additions & 0 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,23 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
int64_t alignment = kDefaultBufferAlignment,
MemoryPool* memory_pool = default_memory_pool());

/// \brief Generate a random ListViewArray
///
/// \param[in] values The underlying values array
/// \param[in] size The size of the generated list-view array
/// \param[in] null_probability the probability of a list-view value being null
/// \param[in] force_empty_nulls if true, null list-view entries must have 0 length
/// \param[in] zero_undefined_offsets if true, the offsets of null entries and
///   of empty (0 length) list-views are set to 0
/// \param[in] alignment alignment for memory allocations (in bytes)
/// \param[in] memory_pool memory pool to allocate memory from
///
/// \return a generated Array
std::shared_ptr<Array> ListView(const Array& values, int64_t size,
double null_probability = 0,
bool force_empty_nulls = false,
bool zero_undefined_offsets = false,
int64_t alignment = kDefaultBufferAlignment,
MemoryPool* memory_pool = default_memory_pool());

/// \brief Generate a random MapArray
///
/// \param[in] keys The underlying keys array
Expand Down
78 changes: 75 additions & 3 deletions cpp/src/arrow/testing/random_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class RandomArrayTest : public ::testing::TestWithParam<RandomTestParam> {
}

bool HasList(const DataType& type) {
if (is_var_length_list(type.id())) {
if (is_var_length_list_like(type.id())) {
return true;
}
for (const auto& child : type.fields()) {
Expand Down Expand Up @@ -99,7 +99,7 @@ TEST_P(RandomArrayTest, GenerateArrayAlignment) {
const int64_t alignment = 1024;
auto field = GetField();
if (HasList(*field->type())) {
GTEST_SKIP() << "ListArray::FromArrays does not conserve buffer alignment";
GTEST_SKIP() << "List[View]Array::FromArrays does not conserve buffer alignment";
}
auto array = GenerateArray(*field, /*size=*/13, 0xDEADBEEF, alignment);
AssertTypeEqual(field->type(), array->type());
Expand Down Expand Up @@ -176,6 +176,13 @@ auto values = ::testing::Values(
key_value_metadata({{"force_empty_nulls", "true"}})),
field("listint81024values", list(int8()), true,
key_value_metadata({{"values", "1024"}})),
field("listviewint8", list_view(int8())),
field("listviewlistviewint8", list_view(list_view(int8()))),
field("listviewint8emptynulls", list_view(int8()), true,
key_value_metadata(
{{"force_empty_nulls", "true"}, {"zero_undefined_offsets", "true"}})),
field("listviewint81024values", list_view(int8()), true,
key_value_metadata({{"values", "1024"}})),
field("structints", struct_({
field("int8", int8()),
field("int16", int16()),
Expand All @@ -200,7 +207,8 @@ auto values = ::testing::Values(
field("fixedsizelist", fixed_size_list(int8(), 4)),
field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()),
field("largebinary", large_binary()),
field("largelistlistint8", large_list(list(int8()))));
field("largelistlistint8", large_list(list(int8()))),
field("largelistviewlistviewint8", large_list_view(list_view(int8()))));

INSTANTIATE_TEST_SUITE_P(
TestRandomArrayGeneration, RandomArrayTest, values,
Expand Down Expand Up @@ -399,6 +407,39 @@ TEST(TypeSpecificTests, ListLengths) {
}
}

// With "min_length" == "max_length" in the field metadata, every non-null
// list-view of the generated array must have exactly that length. Checked
// for both list_view and large_list_view.
TEST(TypeSpecificTests, ListViewLengths) {
  {
    // list_view with every view of length 1.
    auto list_view_field =
        arrow::field("list_view", list_view(int8()),
                     key_value_metadata({{"min_length", "1"}, {"max_length", "1"}}));
    auto generated = GenerateArray(*list_view_field, kExpectedLength, 0xDEADBEEF);
    AssertTypeEqual(list_view_field->type(), generated->type());
    auto typed = internal::checked_pointer_cast<ListViewArray>(generated);
    ASSERT_OK(typed->ValidateFull());
    ASSERT_EQ(typed->length(), kExpectedLength);
    for (int row = 0; row < kExpectedLength; ++row) {
      if (typed->IsNull(row)) continue;
      ASSERT_EQ(1, typed->value_length(row));
    }
  }
  {
    // large_list_view with every view of length 10.
    auto list_view_field =
        arrow::field("list_view", large_list_view(int8()),
                     key_value_metadata({{"min_length", "10"}, {"max_length", "10"}}));
    auto generated = GenerateArray(*list_view_field, kExpectedLength, 0xDEADBEEF);
    AssertTypeEqual(list_view_field->type(), generated->type());
    auto typed = internal::checked_pointer_cast<LargeListViewArray>(generated);
    ASSERT_EQ(typed->length(), kExpectedLength);
    ASSERT_OK(typed->ValidateFull());
    for (int row = 0; row < kExpectedLength; ++row) {
      if (typed->IsNull(row)) continue;
      ASSERT_EQ(10, typed->value_length(row));
    }
  }
}

TEST(TypeSpecificTests, MapValues) {
auto field =
arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}}));
Expand Down Expand Up @@ -499,6 +540,24 @@ TEST(RandomList, Basics) {
}
}

// Generates list-view arrays over 1234 random int16 values for several null
// probabilities and checks that each one validates, has the requested length,
// keeps the full values array, and reports a null count matching the number
// of null entries.
TEST(RandomListView, Basics) {
  random::RandomArrayGenerator rng(42);
  for (const double null_probability : {0.0, 0.1, 0.98}) {
    SCOPED_TRACE("null_probability = " + std::to_string(null_probability));
    auto child_values = rng.Int16(1234, 0, 10000, null_probability);
    auto generated = rng.ListView(*child_values, 45, null_probability);
    ASSERT_OK(generated->ValidateFull());
    ASSERT_EQ(generated->length(), 45);
    const auto& typed = checked_cast<const ListViewArray&>(*generated);
    ASSERT_EQ(typed.values()->length(), 1234);
    // Count the nulls by hand and compare with the stored null count.
    int64_t nulls_seen = 0;
    for (int64_t row = 0; row < generated->length(); ++row) {
      if (generated->IsNull(row)) {
        ++nulls_seen;
      }
    }
    ASSERT_EQ(nulls_seen, generated->data()->null_count);
  }
}

TEST(RandomChildFieldNullablity, List) {
random::RandomArrayGenerator rng(42);

Expand All @@ -512,6 +571,19 @@ TEST(RandomChildFieldNullablity, List) {
ARROW_EXPECT_OK(batch->ValidateFull());
}

// Generates an array and a record batch for a list-view of list-views whose
// child fields are declared with explicit (and differing) flags, and checks
// that both pass full validation.
TEST(RandomChildFieldNullablity, ListView) {
  random::RandomArrayGenerator generator(42);

  auto item_field = arrow::field("item", arrow::int8(), true);
  auto inner_field = arrow::field("list_view", list_view(item_field), false);
  auto outer_field = arrow::field("list_view", list_view(inner_field), true);

  auto generated_array = generator.ArrayOf(*outer_field, 428);
  ARROW_EXPECT_OK(generated_array->ValidateFull());

  auto generated_batch = generator.BatchOf({outer_field}, 428);
  ARROW_EXPECT_OK(generated_batch->ValidateFull());
}

TEST(RandomChildFieldNullablity, Struct) {
random::RandomArrayGenerator rng(42);

Expand Down

0 comments on commit 06ca3f2

Please sign in to comment.