Skip to content

Commit

Permalink
GH-32104: [C++] Add support for Run-End encoded data to Arrow (#33641)
Browse files Browse the repository at this point in the history
This PR gathers work from multiple PRs that can be closed after this one is merged:

 - Closes #13752
 - Closes #13754
 - Closes #13842
 - Closes #13882
 - Closes #13916
 - Closes #14063
 - Closes #13970

And the issues associated with those PRs can also be closed:

 - Fixes #20350
 - Add RunEndEncodedScalarType
 - Fixes #32543
 - Fixes #32544
 - Fixes #32688
 - Fixes #32731
 - Fixes #32772
 - Fixes #32774

* Closes: #32104

Lead-authored-by: Felipe Oliveira Carvalho <[email protected]>
Co-authored-by: Tobias Zagorni <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
  • Loading branch information
felipecrv and zagto authored Feb 17, 2023
1 parent 157b8f5 commit 1264e40
Show file tree
Hide file tree
Showing 46 changed files with 2,814 additions and 13 deletions.
4 changes: 4 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,13 @@ set(ARROW_SRCS
array/array_dict.cc
array/array_nested.cc
array/array_primitive.cc
array/array_run_end.cc
array/builder_adaptive.cc
array/builder_base.cc
array/builder_binary.cc
array/builder_decimal.cc
array/builder_dict.cc
array/builder_run_end.cc
array/builder_nested.cc
array/builder_primitive.cc
array/builder_union.cc
Expand Down Expand Up @@ -218,6 +220,7 @@ set(ARROW_SRCS
util/key_value_metadata.cc
util/memory.cc
util/mutex.cc
util/ree_util.cc
util/string.cc
util/string_builder.cc
util/task_group.cc
Expand Down Expand Up @@ -746,6 +749,7 @@ add_arrow_test(array_test
array/array_binary_test.cc
array/array_dict_test.cc
array/array_list_test.cc
array/array_run_end_test.cc
array/array_struct_test.cc
array/array_union_test.cc
array/array_view_test.cc
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,16 @@
/// @{
/// @}

/// \defgroup run-end-encoded-arrays Concrete classes for run-end encoded arrays
/// @{
/// @}

#include "arrow/array/array_base.h" // IWYU pragma: keep
#include "arrow/array/array_binary.h" // IWYU pragma: keep
#include "arrow/array/array_decimal.h" // IWYU pragma: keep
#include "arrow/array/array_dict.h" // IWYU pragma: keep
#include "arrow/array/array_nested.h" // IWYU pragma: keep
#include "arrow/array/array_primitive.h" // IWYU pragma: keep
#include "arrow/array/array_run_end.h" // IWYU pragma: keep
#include "arrow/array/data.h" // IWYU pragma: keep
#include "arrow/array/util.h" // IWYU pragma: keep
12 changes: 11 additions & 1 deletion cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/logging.h"
#include "arrow/util/ree_util.h"
#include "arrow/visit_array_inline.h"
#include "arrow/visitor.h"

Expand Down Expand Up @@ -143,6 +144,15 @@ struct ScalarFromArraySlotImpl {
return Status::OK();
}

// Produces the scalar for logical position index_ of a run-end encoded array:
// the logical index is mapped to the physical run that covers it, the scalar of
// that run is read from the values child, and the result is wrapped in a
// RunEndEncodedScalar carrying the array's own type.
Status Visit(const RunEndEncodedArray& a) {
  const ArraySpan ree_span{*a.data()};
  const int64_t run_index =
      ree_util::FindPhysicalIndex(ree_span, index_, ree_span.offset);
  // Reuse this same visitor on the values child, at the physical position.
  ARROW_ASSIGN_OR_RAISE(auto run_value,
                        ScalarFromArraySlotImpl(*a.values(), run_index).Finish());
  out_ = std::make_shared<RunEndEncodedScalar>(std::move(run_value), a.type());
  return Status::OK();
}

Status Visit(const ExtensionArray& a) {
ARROW_ASSIGN_OR_RAISE(auto storage, a.storage()->GetScalar(index_));
out_ = std::make_shared<ExtensionScalar>(std::move(storage), a.type());
Expand All @@ -165,7 +175,7 @@ struct ScalarFromArraySlotImpl {
array_.length());
}

if (array_.IsNull(index_)) {
if (array_.type()->id() != Type::RUN_END_ENCODED && array_.IsNull(index_)) {
auto null = MakeNullScalar(array_.type());
if (is_dictionary(array_.type()->id())) {
auto& dict_null = checked_cast<DictionaryScalar&>(*null);
Expand Down
98 changes: 98 additions & 0 deletions cpp/src/arrow/array/array_run_end.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/array/array_run_end.h"
#include "arrow/array/util.h"
#include "arrow/util/logging.h"
#include "arrow/util/ree_util.h"

namespace arrow {

// ----------------------------------------------------------------------
// RunEndEncodedArray

// Wraps already-assembled ArrayData; all checking and child-array caching is
// delegated to SetData().
RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<ArrayData>& data) {
  SetData(data);
}

// Assembles the ArrayData for a run-end encoded array out of its two children
// and hands it to SetData(). `length` and `offset` are passed through unchanged
// as the length and offset of the resulting ArrayData.
RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<DataType>& type,
                                       int64_t length,
                                       const std::shared_ptr<Array>& run_ends,
                                       const std::shared_ptr<Array>& values,
                                       int64_t offset) {
  // A single null entry is passed as the buffer list.
  std::vector<std::shared_ptr<Buffer>> buffers{NULLPTR};
  // Child order is fixed: run ends first, values second.
  std::vector<std::shared_ptr<ArrayData>> children{run_ends->data(), values->data()};
  SetData(ArrayData::Make(type, length, std::move(buffers), std::move(children),
                          /*null_count=*/0, offset));
}

// Builds a RunEndEncodedArray from its run-ends and values children. The data
// type is inferred as run_end_encoded(run_ends->type(), values->type()).
//
// Returns Status::Invalid when:
//  - the run-end type is not accepted by RunEndEncodedType::RunEndTypeValid(),
//  - the run-ends array reports a non-zero null count,
//  - the values array is shorter than the run-ends array,
//  - logical_length or logical_offset is negative,
//  - the logical range (offset + length) is non-empty but there are no runs.
//
// The last three conditions are rejected here so that invalid input surfaces
// as an error Result instead of reaching the invariants checked in SetData().
Result<std::shared_ptr<RunEndEncodedArray>> RunEndEncodedArray::Make(
    int64_t logical_length, const std::shared_ptr<Array>& run_ends,
    const std::shared_ptr<Array>& values, int64_t logical_offset) {
  auto run_end_type = run_ends->type();
  auto values_type = values->type();
  if (!RunEndEncodedType::RunEndTypeValid(*run_end_type)) {
    return Status::Invalid("Run end type must be int16, int32 or int64");
  }
  if (run_ends->null_count() != 0) {
    return Status::Invalid("Run ends array cannot contain null values");
  }
  if (values->length() < run_ends->length()) {
    return Status::Invalid("Values array has to be at least as long as run ends array");
  }
  if (logical_length < 0) {
    return Status::Invalid("Logical length must not be negative, got ", logical_length);
  }
  if (logical_offset < 0) {
    return Status::Invalid("Logical offset must not be negative, got ", logical_offset);
  }
  // Both values are known to be non-negative here, so "offset + length != 0" is
  // tested without performing the addition (which could overflow).
  if (run_ends->length() == 0 && (logical_length > 0 || logical_offset > 0)) {
    return Status::Invalid(
        "Run ends array cannot be empty when offset + length is greater than zero");
  }

  return std::make_shared<RunEndEncodedArray>(
      run_end_encoded(std::move(run_end_type), std::move(values_type)), logical_length,
      run_ends, values, logical_offset);
}

// Checks that `data` describes a run-end encoded array, installs it through
// Array::SetData(), and caches Array wrappers for the two children
// (child_data[0] = run ends, child_data[1] = values).
//
// The type id, the child count and the child type ids are enforced with
// ARROW_CHECK; the remaining invariants are expressed as DCHECKs.
void RunEndEncodedArray::SetData(const std::shared_ptr<ArrayData>& data) {
  ARROW_CHECK_EQ(data->type->id(), Type::RUN_END_ENCODED);
  // The child count must be verified before child_data[0] / child_data[1] are
  // dereferenced below. It uses ARROW_CHECK, like the checks that perform those
  // dereferences, so the guard is in force whenever they are.
  ARROW_CHECK_EQ(data->child_data.size(), 2);
  const auto* ree_type =
      internal::checked_cast<const RunEndEncodedType*>(data->type.get());
  ARROW_CHECK_EQ(ree_type->run_end_type()->id(), data->child_data[0]->type->id());
  ARROW_CHECK_EQ(ree_type->value_type()->id(), data->child_data[1]->type->id());

  // A non-zero number of logical values in this array (offset + length) implies
  // a non-zero number of runs and values.
  DCHECK(data->offset + data->length == 0 || data->child_data[0]->length > 0);
  DCHECK(data->offset + data->length == 0 || data->child_data[1]->length > 0);
  // At least as many values as run_ends
  DCHECK_GE(data->child_data[1]->length, data->child_data[0]->length);

  // The null count for run-end encoded arrays is always 0. Actual number of
  // nulls needs to be calculated through other means.
  DCHECK_EQ(data->null_count, 0);

  Array::SetData(data);
  run_ends_array_ = MakeArray(this->data()->child_data[0]);
  values_array_ = MakeArray(this->data()->child_data[1]);
}

int64_t RunEndEncodedArray::FindPhysicalOffset() const {
const ArraySpan span(*this->data_);
return ree_util::FindPhysicalIndex(span, 0, span.offset);
}

int64_t RunEndEncodedArray::FindPhysicalLength() const {
const ArraySpan span(*this->data_);
return ree_util::FindPhysicalLength(span);
}

} // namespace arrow
109 changes: 109 additions & 0 deletions cpp/src/arrow/array/array_run_end.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Array accessor classes for run-end encoded arrays

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup run-end-encoded-arrays
///
/// @{

// ----------------------------------------------------------------------
// RunEndEncoded

/// \brief Array type for run-end encoded data
class ARROW_EXPORT RunEndEncodedArray : public Array {
private:
// Array wrappers around child_data[0] (run ends) and child_data[1] (values);
// both are (re)built by SetData().
std::shared_ptr<Array> run_ends_array_;
std::shared_ptr<Array> values_array_;

public:
using TypeClass = RunEndEncodedType;

/// \brief Construct a RunEndEncodedArray from already-assembled ArrayData
///
/// The data is checked and installed by SetData().
explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);

/// \brief Construct a RunEndEncodedArray from all parameters
///
/// The length and offset parameters refer to the dimensions of the logical
/// array which is the array we would get after expanding all the runs into
/// repeated values. As such, length can be much greater than the length of
/// the child run_ends and values arrays.
RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Array>& run_ends,
const std::shared_ptr<Array>& values, int64_t offset = 0);

/// \brief Construct a RunEndEncodedArray from values and run ends arrays
///
/// The data type is automatically inferred from the arguments.
/// The values array must be at least as long as the run_ends array, the
/// run_ends array must not contain nulls, and its type must be a valid
/// run-end type; otherwise an Invalid status is returned.
static Result<std::shared_ptr<RunEndEncodedArray>> Make(
int64_t logical_length, const std::shared_ptr<Array>& run_ends,
const std::shared_ptr<Array>& values, int64_t logical_offset = 0);

protected:
/// \brief Check `data`, install it on this array and rebuild the cached
/// run_ends and values child arrays
void SetData(const std::shared_ptr<ArrayData>& data);

public:
/// \brief Returns an array holding the logical indexes of each run-end
///
/// The physical offset to the array is applied.
///
/// NOTE(review): SetData() builds this array directly from child_data[0];
/// confirm which offset "physical offset" refers to here, since the logical
/// offset of this array does not appear to be applied to the child.
const std::shared_ptr<Array>& run_ends() const { return run_ends_array_; }

/// \brief Returns an array holding the values of each run
///
/// The physical offset to the array is applied.
///
/// NOTE(review): SetData() builds this array directly from child_data[1];
/// confirm which offset "physical offset" refers to here, since the logical
/// offset of this array does not appear to be applied to the child.
const std::shared_ptr<Array>& values() const { return values_array_; }

/// \brief Find the physical offset of this REE array
///
/// This function uses binary-search, so it has a O(log N) cost.
int64_t FindPhysicalOffset() const;

/// \brief Find the physical length of this REE array
///
/// The physical length of an REE is the number of physical values (and
/// run-ends) necessary to represent the logical range of values from offset
/// to length.
///
/// Avoid calling this function if the physical length can be established in
/// some other way (e.g. when iterating over the runs sequentially until the
/// end). This function uses binary-search, so it has a O(log N) cost.
int64_t FindPhysicalLength() const;
};

/// @}

} // namespace arrow
Loading

0 comments on commit 1264e40

Please sign in to comment.