Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for run-end encoded array #507

Merged
merged 11 commits into from
Jun 7, 2024
85 changes: 85 additions & 0 deletions src/nanoarrow/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
switch (storage_type) {
case NANOARROW_TYPE_UNINITIALIZED:
case NANOARROW_TYPE_NA:
case NANOARROW_TYPE_RUN_END_ENCODED:
array->n_buffers = 0;
break;

Expand Down Expand Up @@ -811,6 +812,15 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
(long)array_view->n_children);
return EINVAL;
}
break;
case NANOARROW_TYPE_RUN_END_ENCODED:
if (array_view->n_children != 2) {
ArrowErrorSet(error, "Expected 2 child of %s array but found %ld child arrays",
ArrowTypeString(array_view->storage_type),
(long)array_view->n_children);
return EINVAL;
}
break;
default:
break;
}
Expand Down Expand Up @@ -846,6 +856,22 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
return EINVAL;
}
break;

case NANOARROW_TYPE_RUN_END_ENCODED:
if (array_view->children[0]->null_count != 0) {
ArrowErrorSet(error, "Run Ends cannot be null but found null_count value %ld",
(long)array_view->children[0]->null_count);
return EINVAL;
}
if (array_view->children[0]->length < array_view->children[1]->length) {
ArrowErrorSet(
error,
"Expected the 2 children of run-end encoded array to have equal length "
"but found mismatched lengths: run_ends->length=%ld, values->length=%ld",
(long)array_view->children[0]->length, (long)array_view->children[1]->length);
return EINVAL;
}
break;
default:
break;
}
Expand Down Expand Up @@ -995,6 +1021,31 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
}
}
break;

case NANOARROW_TYPE_RUN_END_ENCODED: {
struct ArrowArrayView* run_ends_view;
run_ends_view = array_view->children[0];
if (run_ends_view->null_count != 0) {
ArrowErrorSet(error, "Run Ends cannot be null but found null_count value %ld",
(long)run_ends_view->null_count);
return EINVAL;
}
for (int64_t i = 0, prev_run_end = 0; i < run_ends_view->length; i++) {
int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
if (run_end < 0 || run_end < prev_run_end) {
return EINVAL;
}
prev_run_end = run_end;
if (i == run_ends_view->length - 1 && run_end != array_view->length) {
ArrowErrorSet(error,
"Run End value %ld at index %ld exceeds the logical "
"length of the run-end encoded array %ld",
(long)run_end, (long)i, (long)array_view->length);
return EINVAL;
}
}
break;
}
default:
break;
}
Expand Down Expand Up @@ -1163,6 +1214,40 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
}
}

// Run-end encoded arrays: children[0] holds the run ends. They must be stored as
// INT16/INT32/INT64, be non-negative and non-decreasing, and the last run end
// must equal the logical length of the parent (run-end encoded) array.
if (array_view->storage_type == NANOARROW_TYPE_RUN_END_ENCODED) {
  struct ArrowArrayView* run_ends_view = array_view->children[0];
  switch (run_ends_view->storage_type) {
    case NANOARROW_TYPE_INT16:
    case NANOARROW_TYPE_INT32:
    case NANOARROW_TYPE_INT64:
      break;
    default:
      ArrowErrorSet(
          error,
          "Run-end encoded array only supports INT16, INT32 or INT64 run-ends "
          "but found run-ends type %s",
          ArrowTypeString(run_ends_view->storage_type));
      return EINVAL;
  }

  // prev_run_end starts at 0 so that an empty run ends child is only valid for a
  // zero-length parent array.
  int64_t prev_run_end = 0;
  for (int64_t i = 0; i < run_ends_view->length; i++) {
    int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
    if (run_end < 0) {
      // Previously this path returned EINVAL without describing the failure.
      ArrowErrorSet(error, "Run End value %ld at index %ld must not be negative",
                    (long)run_end, (long)i);
      return EINVAL;
    }
    if (run_end < prev_run_end) {
      ArrowErrorSet(error,
                    "Run End value %ld at index %ld is less than the previous "
                    "run end value %ld",
                    (long)run_end, (long)i, (long)prev_run_end);
      return EINVAL;
    }
    prev_run_end = run_end;
  }

  if (prev_run_end != array_view->length) {
    // The original literals concatenated to "lengthof" (missing space) and
    // referred to the values array although the parent's length is compared.
    ArrowErrorSet(error,
                  "The last run end value of a run-end encoded array must be equal to "
                  "the logical length of the array but found the last run end value "
                  "%ld while the logical length is %ld",
                  (long)prev_run_end, (long)array_view->length);
    return EINVAL;
  }
}

// Recurse for children
for (int64_t i = 0; i < array_view->n_children; i++) {
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error));
Expand Down
38 changes: 38 additions & 0 deletions src/nanoarrow/array_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,44 @@ static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) {
}
}
break;

case NANOARROW_TYPE_RUN_END_ENCODED:
if (array->children[0]->length != array->children[1]->length) {
return EINVAL;
}
if (array->children[0]->null_count != 0) {
return EINVAL;
}
array->length = 0;
struct ArrowBuffer* data_buffer;
data_buffer = ArrowArrayBuffer(array->children[0], 1);
for (int64_t i = 0; i < array->children[0]->length; i++) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure that you need a loop here? (i.e., is it possible to just extract the last run end?)

I wonder if this would be a better fit for ArrowArrayFinishBuilding()? Or maybe we need FinishAppending()? I am not sure I would have thought of FinishElement to do these checks/updates since they really only need to happen once per array.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe ArrowArrayFinishRunEndEncoded might be better as it's more suggestive from its name and less confusion?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps, but it is also very similar to the piece of code we have that updates the null_count based on the validity buffer, which I think happens in ArrowArrayFinishBuilding() (although that "feature" has caused me some personal confusion because I sometimes forget that it happens and wonder why the null_count isn't what I set it to be).

The most backward compatible thing to do would be to not do any updating of lengths (i.e., force the caller to set the length of the parent run-end-encoded array). This would not necessarily be difficult to do because they would have to be keeping track of that length to append to the run-ends child (so they would have to keep a counter somewhere anyway). Perhaps for this PR the helper could be omitted and we can learn from the experience of implementing this elsewhere what the best intervention would be?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps for this PR the helper could be omitted and we can learn from the experience of implementing this elsewhere what the best intervention would be?

No problem, I've removed it :)

int64_t run_end = -1;
switch (((struct ArrowArrayPrivateData*)array->children[0]->private_data)
->storage_type) {
case NANOARROW_TYPE_INT16:
run_end = ((int16_t*)data_buffer->data)[i];
break;
case NANOARROW_TYPE_INT32:
run_end = ((int32_t*)data_buffer->data)[i];
break;
case NANOARROW_TYPE_INT64:
run_end = ((int64_t*)data_buffer->data)[i];
break;
default:
break;
}
if (run_end < 0) {
return EINVAL;
}
array->length = run_end;
}

if (private_data->bitmap.buffer.data != NULL) {
NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1, 1));
}

return NANOARROW_OK;
default:
return EINVAL;
}
Expand Down
77 changes: 77 additions & 0 deletions src/nanoarrow/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <arrow/array/builder_decimal.h>
#include <arrow/array/builder_nested.h>
#include <arrow/array/builder_primitive.h>
#include <arrow/array/builder_run_end.h>
#include <arrow/array/builder_time.h>
#include <arrow/array/builder_union.h>
#include <arrow/c/bridge.h>
Expand Down Expand Up @@ -1440,6 +1441,82 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}

// Builds a run-end encoded array (int32 run ends, float values) through the
// nanoarrow builder API, checks the validation error paths for a wrong child
// count, mismatched child lengths and a logical length that disagrees with the
// last run end, then compares the result with an array built by Arrow C++.
TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
struct ArrowArray array;
struct ArrowSchema schema;
struct ArrowError error;

ArrowSchemaInit(&schema);
ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32,
NANOARROW_TYPE_FLOAT),
NANOARROW_OK);
ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);

ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);

// children[0] receives the run ends (4, 6, 7); children[1] receives the
// values (1.0, null, 2.0), i.e. three runs.
ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 6), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 7), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 1.0), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendNull(array.children[1], 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 2.0), NANOARROW_OK);
EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

// Make sure number of children is checked at finish
// (n_children is temporarily corrupted and restored afterwards)
array.n_children = 0;
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Expected 2 child of run_end_encoded array but found 0 child arrays");
array.n_children = 2;

// Make sure final child size is checked at finish
// (the run ends child is made one element shorter than the values child)
array.children[0]->length = array.children[0]->length - 1;
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
EXPECT_STREQ(
ArrowErrorMessage(&error),
"Expected the 2 children of run-end encoded array to have equal length but "
"found mismatched lengths: run_ends->length=2, values->length=3");

// Restore the run ends length; finishing must now succeed.
array.children[0]->length = array.children[0]->length + 1;
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);

// Make sure the last run end is validated against the logical length of the
// run-end encoded array
struct ArrowArrayView array_view;
ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, NULL), NANOARROW_OK);
ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, NULL), NANOARROW_OK);
EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error),
NANOARROW_OK);

// Shrinking the view's logical length by one makes the last run end (7)
// disagree with the length (6).
array_view.length -= 1;
EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error),
EINVAL);
EXPECT_STREQ(ArrowErrorMessage(&error),
"Run End value 7 at index 2 exceeds the logical length of the run-end "
"encoded array 6");
array_view.length += 1;
ArrowArrayViewReset(&array_view);

// Hand the nanoarrow-built array over to Arrow C++.
auto arrow_array = ImportArray(&array, &schema);
ARROW_EXPECT_OK(arrow_array);

// Build the same run ends and values with Arrow C++'s RunEndEncodedBuilder.
auto run_ends_builder = std::make_shared<Int32Builder>();
auto values_builder = std::make_shared<FloatBuilder>();
auto builder =
RunEndEncodedBuilder(default_memory_pool(), run_ends_builder, values_builder,
run_end_encoded(int32(), float32()));
ARROW_EXPECT_OK(run_ends_builder->Append(4));
ARROW_EXPECT_OK(run_ends_builder->Append(6));
ARROW_EXPECT_OK(run_ends_builder->Append(7));
ARROW_EXPECT_OK(values_builder->Append(1.0));
ARROW_EXPECT_OK(values_builder->AppendNull());
ARROW_EXPECT_OK(values_builder->Append(2.0));
auto expected_array = builder.Finish();
ARROW_EXPECT_OK(expected_array);

// The two arrays are compared through their string representations.
EXPECT_STREQ(arrow_array.ValueUnsafe()->ToString().c_str(),
expected_array.ValueUnsafe()->ToString().c_str());
}

TEST(ArrayTest, ArrayTestUnionUtils) {
// Check length calculation with nullptr
EXPECT_EQ(_ArrowParseUnionTypeIds("", nullptr), 0);
Expand Down
11 changes: 11 additions & 0 deletions src/nanoarrow/nanoarrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize)
#define ArrowSchemaSetTypeDecimal \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal)
#define ArrowSchemaSetTypeRunEndEncoded \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeRunEndEncoded)
#define ArrowSchemaSetTypeDateTime \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime)
#define ArrowSchemaSetTypeUnion \
Expand Down Expand Up @@ -372,6 +374,15 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowT
int32_t decimal_precision,
int32_t decimal_scale);

/// \brief Set the format field and initialize the children of a run-end encoded schema
///
/// Returns EINVAL for run_end_type that is not
/// NANOARROW_TYPE_INT16, NANOARROW_TYPE_INT32 or NANOARROW_TYPE_INT64.
/// Schema must have been initialized using ArrowSchemaInit() or ArrowSchemaDeepCopy().
ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema,
enum ArrowType run_end_type,
enum ArrowType value_type);

/// \brief Set the format field of a time, timestamp, or duration schema
///
/// Returns EINVAL for type that is not
Expand Down
15 changes: 14 additions & 1 deletion src/nanoarrow/nanoarrow_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,8 @@ enum ArrowType {
NANOARROW_TYPE_LARGE_STRING,
NANOARROW_TYPE_LARGE_BINARY,
NANOARROW_TYPE_LARGE_LIST,
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO,
NANOARROW_TYPE_RUN_END_ENCODED
};

/// \brief Get a string value of an enum ArrowType value
Expand Down Expand Up @@ -537,6 +538,8 @@ static inline const char* ArrowTypeString(enum ArrowType type) {
return "large_list";
case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
return "interval_month_day_nano";
case NANOARROW_TYPE_RUN_END_ENCODED:
return "run_end_encoded";
default:
return NULL;
}
Expand Down Expand Up @@ -953,6 +956,16 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal,
memcpy(decimal->words, value, decimal->n_words * sizeof(uint64_t));
}

/// \brief A representation of a single run of a run-end encoded array
/// \ingroup nanoarrow-utils
struct ArrowRunEndEncoded {
/// \brief The length of this run
int64_t length;

// NOTE(review): a "value of this run" member was documented here but is not
// declared; as written the struct stores only the run length.
};

#ifdef __cplusplus
}
#endif
Expand Down
41 changes: 41 additions & 0 deletions src/nanoarrow/schema.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ static const char* ArrowSchemaFormatTemplate(enum ArrowType type) {
return "+s";
case NANOARROW_TYPE_MAP:
return "+m";
case NANOARROW_TYPE_RUN_END_ENCODED:
return "+r";

default:
return NULL;
Expand Down Expand Up @@ -155,6 +157,13 @@ static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema,
NANOARROW_RETURN_NOT_OK(
ArrowSchemaSetName(schema->children[0]->children[1], "value"));
break;
case NANOARROW_TYPE_RUN_END_ENCODED:
NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 2));
ArrowSchemaInit(schema->children[0]);
NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "run_ends"));
schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE;
ArrowSchemaInit(schema->children[1]);
NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[1], "values"));
default:
break;
}
Expand Down Expand Up @@ -277,6 +286,28 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowT
return ArrowSchemaSetFormat(schema, buffer);
}

// Configure `schema` as a run-end encoded type with two children: children[0]
// is given `run_end_type` and children[1] is given `value_type`.
//
// Returns EINVAL when `run_end_type` is not INT16, INT32 or INT64; otherwise
// returns the first non-OK code produced by the helper calls, or NANOARROW_OK.
ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema,
                                               enum ArrowType run_end_type,
                                               enum ArrowType value_type) {
  // Guard clause: reject run end types other than the three accepted ones
  // before touching the schema.
  if (run_end_type != NANOARROW_TYPE_INT16 && run_end_type != NANOARROW_TYPE_INT32 &&
      run_end_type != NANOARROW_TYPE_INT64) {
    return EINVAL;
  }

  // Parent: format string first, then the two child schemas.
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(
      schema, ArrowSchemaFormatTemplate(NANOARROW_TYPE_RUN_END_ENCODED)));
  NANOARROW_RETURN_NOT_OK(
      ArrowSchemaInitChildrenIfNeeded(schema, NANOARROW_TYPE_RUN_END_ENCODED));

  // Children: index 0 is the run ends, index 1 is the values.
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[0], run_end_type));
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[1], value_type));

  return NANOARROW_OK;
}

static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) {
switch (time_unit) {
case NANOARROW_TIME_UNIT_SECOND:
Expand Down Expand Up @@ -750,6 +781,13 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view,
*format_end_out = format + 2;
return NANOARROW_OK;

// run end encoded has no buffer at all
case 'r':
schema_view->storage_type = NANOARROW_TYPE_RUN_END_ENCODED;
schema_view->type = NANOARROW_TYPE_RUN_END_ENCODED;
*format_end_out = format + 2;
return NANOARROW_OK;

// just validity buffer
case 'w':
if (format[2] != ':' || format[3] == '\0') {
Expand Down Expand Up @@ -1124,6 +1162,9 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_vie
case NANOARROW_TYPE_FIXED_SIZE_LIST:
return ArrowSchemaViewValidateNChildren(schema_view, 1, error);

case NANOARROW_TYPE_RUN_END_ENCODED:
return ArrowSchemaViewValidateNChildren(schema_view, 2, error);

case NANOARROW_TYPE_STRUCT:
return ArrowSchemaViewValidateNChildren(schema_view, -1, error);

Expand Down
Loading
Loading