Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for run-end encoded array #507

Merged
merged 11 commits into from
Jun 7, 2024
108 changes: 108 additions & 0 deletions src/nanoarrow/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
switch (storage_type) {
case NANOARROW_TYPE_UNINITIALIZED:
case NANOARROW_TYPE_NA:
case NANOARROW_TYPE_RUN_END_ENCODED:
array->n_buffers = 0;
break;

Expand Down Expand Up @@ -811,6 +812,15 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
(long)array_view->n_children);
return EINVAL;
}
break;
case NANOARROW_TYPE_RUN_END_ENCODED:
if (array_view->n_children != 2) {
ArrowErrorSet(
error, "Expected 2 children for %s array but found %ld child arrays",
ArrowTypeString(array_view->storage_type), (long)array_view->n_children);
return EINVAL;
}
break;
default:
break;
}
Expand Down Expand Up @@ -846,6 +856,68 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
return EINVAL;
}
break;

case NANOARROW_TYPE_RUN_END_ENCODED: {
if (array_view->n_children != 2) {
ArrowErrorSet(error,
"Expected 2 children for run-end encoded array but found %ld",
(long)array_view->n_children);
return EINVAL;
}
struct ArrowArrayView* run_ends_view = array_view->children[0];
struct ArrowArrayView* values_view = array_view->children[1];
int64_t max_length;
switch (run_ends_view->storage_type) {
case NANOARROW_TYPE_INT16:
max_length = INT16_MAX;
break;
case NANOARROW_TYPE_INT32:
max_length = INT32_MAX;
break;
case NANOARROW_TYPE_INT64:
max_length = INT64_MAX;
break;
default:
ArrowErrorSet(
error,
"Run-end encoded array only supports INT16, INT32 or INT64 run-ends "
"but found run-ends type %s",
ArrowTypeString(run_ends_view->storage_type));
return EINVAL;
}
// uint64_t is used here to avoid overflow when adding the offset and length
if ((uint64_t)array_view->offset + (uint64_t)array_view->length >
(uint64_t)max_length) {
ArrowErrorSet(
error,
"Offset + length of a run-end encoded array must fit in a value"
" of the run end type %s, but offset + length is %lu while the "
"allowed maximum is %lu",
ArrowTypeString(run_ends_view->storage_type),
(unsigned long)array_view->offset + (unsigned long)array_view->length,
(unsigned long)max_length);
return EINVAL;
}
if (run_ends_view->length > values_view->length) {
ArrowErrorSet(
error, "Length of run_ends is greater than the length of values: %ld > %ld",
(long)run_ends_view->length, (long)values_view->length);
return EINVAL;
}
if (run_ends_view->length == 0 && values_view->length != 0) {
ArrowErrorSet(error,
"Run-end encoded array has zero length %ld, but values array has "
"non-zero length",
(long)values_view->length);
return EINVAL;
}
if (run_ends_view->null_count != 0) {
ArrowErrorSet(error, "Null count must be 0 for run ends array, but is %ld",
(long)run_ends_view->null_count);
return EINVAL;
}
break;
}
default:
break;
}
Expand Down Expand Up @@ -995,6 +1067,18 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
}
}
break;

case NANOARROW_TYPE_RUN_END_ENCODED: {
struct ArrowArrayView* run_ends_view = array_view->children[0];
int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0);
if (last_run_end < 1) {
ArrowErrorSet(error,
"All run ends must be greater than 0 but the first run end is %ld",
(long)last_run_end);
return EINVAL;
}
break;
}
default:
break;
}
Expand Down Expand Up @@ -1163,6 +1247,30 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
}
}

if (array_view->storage_type == NANOARROW_TYPE_RUN_END_ENCODED) {
struct ArrowArrayView* run_ends_view = array_view->children[0];
int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0);
for (int64_t i = 1; i < run_ends_view->length; i++) {
const int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
if (run_end <= last_run_end) {
ArrowErrorSet(error,
"Every run end must be strictly greater than the previous run end, "
"but run_ends[%ld] is %ld and run_ends[%ld] is %ld",
(long)i, (long)run_end, (long)i - 1, (long)last_run_end);
return EINVAL;
}
last_run_end = run_end;
}
last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, run_ends_view->length - 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, run_ends_view->length - 1);

Is there any chance of run_ends_view->length being 0 here? If so, the index passed to ArrowArrayViewGetIntUnsafe() would be -1, which I think might invoke undefined behavior (UB).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! I'll send a PR

if (last_run_end < (array_view->offset + array_view->length)) {
ArrowErrorSet(error,
"Last run end is %ld but it should >= %ld (offset: %ld, length: %ld)",
(long)last_run_end, (long)(array_view->offset + array_view->length),
(long)array_view->offset, (long)array_view->length);
return EINVAL;
}
}

// Recurse for children
for (int64_t i = 0; i < array_view->n_children; i++) {
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error));
Expand Down
1 change: 1 addition & 0 deletions src/nanoarrow/array_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,7 @@ static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) {
}
}
break;
return NANOARROW_OK;
default:
return EINVAL;
}
Expand Down
153 changes: 153 additions & 0 deletions src/nanoarrow/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <arrow/array/builder_decimal.h>
#include <arrow/array/builder_nested.h>
#include <arrow/array/builder_primitive.h>
#include <arrow/array/builder_run_end.h>
#include <arrow/array/builder_time.h>
#include <arrow/array/builder_union.h>
#include <arrow/c/bridge.h>
Expand Down Expand Up @@ -1440,6 +1441,158 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}

// Builds a run-end encoded array by appending to its two children directly,
// exercises each validation failure reported by ArrowArrayFinishBuilding(), and
// finally compares the finished array against one built with Arrow C++.
TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
struct ArrowArray array;
struct ArrowSchema schema;
struct ArrowError error;

// In this test case we construct a run-end encoded array with logical length = 7
// whose values are float32s.
//
// The logical (decoded) array:
// type: Float32
// [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
//
// The run-end encoded representation:
// run_ends<INT32>: [4, 6, 7]
// values<FLOAT>: [1.0, null, 2.0]

ArrowSchemaInit(&schema);
ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32), NANOARROW_OK);
ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_FLOAT), NANOARROW_OK);
ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);

// children[0] receives the run ends and children[1] the values; the parent's
// logical length is not derived from the appends, so it is set by hand below.
ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 6), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 7), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 1.0), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendNull(array.children[1], 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 2.0), NANOARROW_OK);
array.length = 7;

// Make sure number of children is checked at finish
array.n_children = 0;
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Expected 2 children for run_end_encoded array but found 0 child arrays");
array.n_children = 2;

// Make sure offset + length is checked against the maximum value of the
// run-end type. The run-ends storage type is overwritten in place to cover
// int32, int16 and int64 in turn; array.length stays 7 throughout.
{
array.offset = INT32_MAX;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(
ArrowErrorMessage(&error),
"Offset + length of a run-end encoded array must fit in a value of the "
"run end type int32, but offset + length is 2147483654 while the allowed "
"maximum is 2147483647");

((struct ArrowArrayPrivateData*)(array.children[0]->private_data))->storage_type =
NANOARROW_TYPE_INT16;
array.offset = INT16_MAX;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(
ArrowErrorMessage(&error),
"Offset + length of a run-end encoded array must fit in a value of the run end "
"type int16, but offset + length is 32774 while the allowed maximum is 32767");

((struct ArrowArrayPrivateData*)(array.children[0]->private_data))->storage_type =
NANOARROW_TYPE_INT64;
array.offset = INT64_MAX;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Offset + length of a run-end encoded array must fit in a value of the "
"run end type int64, but offset + length is 9223372036854775814 while "
"the allowed "
"maximum is 9223372036854775807");
}
// Restore the original run-ends type and offset before the remaining checks
((struct ArrowArrayPrivateData*)(array.children[0]->private_data))->storage_type =
NANOARROW_TYPE_INT32;
array.offset = 0;

// Make sure a run_ends child longer than the values child is rejected
array.children[0]->length += 1;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Length of run_ends is greater than the length of values: 4 > 3");
array.children[0]->length -= 1;

// Make sure an empty run_ends child with a non-empty values child is rejected
// (the number in the expected message is the values length, 3)
array.children[0]->length = 0;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(
ArrowErrorMessage(&error),
"Run-end encoded array has zero length 3, but values array has non-zero length");
array.children[0]->length = 3;

// Make sure a non-zero null count on the run_ends child is rejected
array.children[0]->null_count = 1;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Null count must be 0 for run ends array, but is 1");
array.children[0]->null_count = 0;

// The array can be a slice (offset/length window) of the logical array:
// [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
//       ^                          ^
//       |- offset = 1              |- length = 6
array.length = 6;
array.offset = 1;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
NANOARROW_OK);

// Check for off-by-one errors.
// This one makes offset + length one larger than the last run end:
// [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
//       ^                               ^
//       |- offset = 1                   |- length = 7 (out of bound)
array.length = 7;
array.offset = 1;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Last run end is 7 but it should >= 8 (offset: 1, length: 7)");

// Same overrun, this time caused by the length alone:
// [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
//  ^                                    ^
//  |- offset = 0                        |- length = 8 (out of bound)
array.length = 8;
array.offset = 0;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Last run end is 7 but it should >= 8 (offset: 0, length: 8)");

// The full logical array (offset 0, length 7) validates cleanly
array.length = 7;
array.offset = 0;
EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, &error),
NANOARROW_OK);

// Import into Arrow C++ and compare against the same data built with
// RunEndEncodedBuilder; the comparison is on the ToString() output.
auto arrow_array = ImportArray(&array, &schema);
ARROW_EXPECT_OK(arrow_array);

auto run_ends_builder = std::make_shared<Int32Builder>();
auto values_builder = std::make_shared<FloatBuilder>();
auto builder =
RunEndEncodedBuilder(default_memory_pool(), run_ends_builder, values_builder,
run_end_encoded(int32(), float32()));
ARROW_EXPECT_OK(run_ends_builder->Append(4));
ARROW_EXPECT_OK(run_ends_builder->Append(6));
ARROW_EXPECT_OK(run_ends_builder->Append(7));
ARROW_EXPECT_OK(values_builder->Append(1.0));
ARROW_EXPECT_OK(values_builder->AppendNull());
ARROW_EXPECT_OK(values_builder->Append(2.0));
auto expected_array = builder.Finish();
ARROW_EXPECT_OK(expected_array);

EXPECT_STREQ(arrow_array.ValueUnsafe()->ToString().c_str(),
expected_array.ValueUnsafe()->ToString().c_str());
}

TEST(ArrayTest, ArrayTestUnionUtils) {
// Check length calculation with nullptr
EXPECT_EQ(_ArrowParseUnionTypeIds("", nullptr), 0);
Expand Down
13 changes: 13 additions & 0 deletions src/nanoarrow/nanoarrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize)
#define ArrowSchemaSetTypeDecimal \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal)
#define ArrowSchemaSetTypeRunEndEncoded \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeRunEndEncoded)
#define ArrowSchemaSetTypeDateTime \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime)
#define ArrowSchemaSetTypeUnion \
Expand Down Expand Up @@ -372,6 +374,17 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowT
int32_t decimal_precision,
int32_t decimal_scale);

/// \brief Set the format field of a run-end encoded schema
///
/// Returns EINVAL for run_end_type that is not
/// NANOARROW_TYPE_INT16, NANOARROW_TYPE_INT32 or NANOARROW_TYPE_INT64.
/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
/// After this call, the caller must call `ArrowSchemaSetTypeXXX(schema->children[1])`
/// to set the type of the values child.
///
/// Note that when building arrays using the `ArrowArrayAppendXXX()` functions,
/// values are appended to the child arrays, so the run-end encoded array's own
/// logical length must be updated manually.
///
/// \param schema The schema whose format is set to run-end encoded.
/// \param run_end_type The type used to store run ends (int16, int32 or int64).
ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema,
enum ArrowType run_end_type);

/// \brief Set the format field of a time, timestamp, or duration schema
///
/// Returns EINVAL for type that is not
Expand Down
5 changes: 4 additions & 1 deletion src/nanoarrow/nanoarrow_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,8 @@ enum ArrowType {
NANOARROW_TYPE_LARGE_STRING,
NANOARROW_TYPE_LARGE_BINARY,
NANOARROW_TYPE_LARGE_LIST,
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO,
NANOARROW_TYPE_RUN_END_ENCODED
};

/// \brief Get a string value of an enum ArrowType value
Expand Down Expand Up @@ -537,6 +538,8 @@ static inline const char* ArrowTypeString(enum ArrowType type) {
return "large_list";
case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
return "interval_month_day_nano";
case NANOARROW_TYPE_RUN_END_ENCODED:
return "run_end_encoded";
default:
return NULL;
}
Expand Down
Loading
Loading