-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
This PR gathers work from multiple PRs that can be closed after this one is merged: - Closes #13752 - Closes #13754 - Closes #13842 - Closes #13882 - Closes #13916 - Closes #14063 - Closes #13970 And the issues associated with those PRs can also be closed: - Fixes #20350 - Add RunEndEncodedScalarType - Fixes #32543 - Fixes #32544 - Fixes #32688 - Fixes #32731 - Fixes #32772 - Fixes #32774 * Closes: #32104 Lead-authored-by: Felipe Oliveira Carvalho <[email protected]> Co-authored-by: Tobias Zagorni <[email protected]> Signed-off-by: Matt Topol <[email protected]>
- Loading branch information
Showing
46 changed files
with
2,814 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/array/array_run_end.h" | ||
#include "arrow/array/util.h" | ||
#include "arrow/util/logging.h" | ||
#include "arrow/util/ree_util.h" | ||
|
||
namespace arrow { | ||
|
||
// ---------------------------------------------------------------------- | ||
// RunEndEncodedArray | ||
|
||
// Wraps already-assembled ArrayData. All checks and the creation of the
// run-ends / values child arrays happen in SetData().
RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<ArrayData>& data) {
  SetData(data);
}
|
||
// Assembles the ArrayData for a run-end encoded array out of its two children
// and forwards it to SetData(). The array itself gets a single null buffer
// slot and a null count of 0; `length` and `offset` are passed through as-is.
RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<DataType>& type,
                                       int64_t length,
                                       const std::shared_ptr<Array>& run_ends,
                                       const std::shared_ptr<Array>& values,
                                       int64_t offset) {
  // Child order matters: run ends first, values second (SetData() relies on it).
  std::vector<std::shared_ptr<ArrayData>> children{run_ends->data(), values->data()};
  const auto array_data =
      ArrayData::Make(type, length, /*buffers=*/{NULLPTR}, std::move(children),
                      /*null_count=*/0, offset);
  SetData(array_data);
}
|
||
// Builds a run-end encoded array from its children. The data type is derived
// from the types of `run_ends` and `values`.
//
// Returns Status::Invalid when:
//  - the run end type is rejected by RunEndEncodedType::RunEndTypeValid,
//  - `run_ends` contains nulls,
//  - `values` is shorter than `run_ends`,
//  - `logical_length` or `logical_offset` is negative,
//  - `run_ends` is empty while the logical length or offset is non-zero.
//
// Both `run_ends` and `values` must be non-null pointers; they are
// dereferenced unconditionally.
Result<std::shared_ptr<RunEndEncodedArray>> RunEndEncodedArray::Make(
    int64_t logical_length, const std::shared_ptr<Array>& run_ends,
    const std::shared_ptr<Array>& values, int64_t logical_offset) {
  auto run_end_type = run_ends->type();
  auto values_type = values->type();
  if (!RunEndEncodedType::RunEndTypeValid(*run_end_type)) {
    return Status::Invalid("Run end type must be int16, int32 or int64");
  }
  if (run_ends->null_count() != 0) {
    return Status::Invalid("Run ends array cannot contain null values");
  }
  if (values->length() < run_ends->length()) {
    return Status::Invalid("Values array has to be at least as long as run ends array");
  }
  if (logical_length < 0 || logical_offset < 0) {
    return Status::Invalid("Logical length and offset must be non-negative");
  }
  // SetData() only asserts this invariant with DCHECKs, so report it as a
  // Status here instead of aborting (or silently building a broken array).
  // Written without `logical_offset + logical_length` to avoid any overflow.
  if (run_ends->length() == 0 && (logical_length != 0 || logical_offset != 0)) {
    return Status::Invalid(
        "Run ends array cannot be empty when the logical length or offset is non-zero");
  }

  return std::make_shared<RunEndEncodedArray>(
      run_end_encoded(std::move(run_end_type), std::move(values_type)), logical_length,
      run_ends, values, logical_offset);
}
|
||
void RunEndEncodedArray::SetData(const std::shared_ptr<ArrayData>& data) { | ||
ARROW_CHECK_EQ(data->type->id(), Type::RUN_END_ENCODED); | ||
const auto* ree_type = | ||
internal::checked_cast<const RunEndEncodedType*>(data->type.get()); | ||
ARROW_CHECK_EQ(ree_type->run_end_type()->id(), data->child_data[0]->type->id()); | ||
ARROW_CHECK_EQ(ree_type->value_type()->id(), data->child_data[1]->type->id()); | ||
|
||
DCHECK_EQ(data->child_data.size(), 2); | ||
|
||
// A non-zero number of logical values in this array (offset + length) implies | ||
// a non-zero number of runs and values. | ||
DCHECK(data->offset + data->length == 0 || data->child_data[0]->length > 0); | ||
DCHECK(data->offset + data->length == 0 || data->child_data[1]->length > 0); | ||
// At least as many values as run_ends | ||
DCHECK_GE(data->child_data[1]->length, data->child_data[0]->length); | ||
|
||
// The null count for run-end encoded arrays is always 0. Actual number of | ||
// nulls needs to be calculated through other means. | ||
DCHECK_EQ(data->null_count, 0); | ||
|
||
Array::SetData(data); | ||
run_ends_array_ = MakeArray(this->data()->child_data[0]); | ||
values_array_ = MakeArray(this->data()->child_data[1]); | ||
} | ||
|
||
int64_t RunEndEncodedArray::FindPhysicalOffset() const { | ||
const ArraySpan span(*this->data_); | ||
return ree_util::FindPhysicalIndex(span, 0, span.offset); | ||
} | ||
|
||
int64_t RunEndEncodedArray::FindPhysicalLength() const { | ||
const ArraySpan span(*this->data_); | ||
return ree_util::FindPhysicalLength(span); | ||
} | ||
|
||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
// Array accessor classes for run-end encoded arrays | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <memory> | ||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "arrow/array/array_base.h" | ||
#include "arrow/array/data.h" | ||
#include "arrow/result.h" | ||
#include "arrow/status.h" | ||
#include "arrow/type.h" | ||
#include "arrow/type_fwd.h" | ||
#include "arrow/util/checked_cast.h" | ||
#include "arrow/util/macros.h" | ||
#include "arrow/util/visibility.h" | ||
|
||
namespace arrow { | ||
|
||
/// \addtogroup run-end-encoded-arrays | ||
/// | ||
/// @{ | ||
|
||
// ---------------------------------------------------------------------- | ||
// RunEndEncoded | ||
|
||
/// \brief Array type for run-end encoded data
///
/// A run-end encoded array has two children: the run ends (child 0) and the
/// values (child 1). Its own length and offset describe the logical array,
/// i.e. the array obtained by expanding every run into repeated values.
class ARROW_EXPORT RunEndEncodedArray : public Array {
 private:
  // Child arrays created by SetData() from child_data[0] and child_data[1].
  std::shared_ptr<Array> run_ends_array_;
  std::shared_ptr<Array> values_array_;

 public:
  using TypeClass = RunEndEncodedType;

  /// \brief Construct a RunEndEncodedArray from existing ArrayData
  ///
  /// The data must have a run-end encoded type and exactly two children
  /// (run ends, then values).
  explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct a RunEndEncodedArray from all parameters
  ///
  /// The length and offset parameters refer to the dimensions of the logical
  /// array which is the array we would get after expanding all the runs into
  /// repeated values. As such, length can be much greater than the length of
  /// the child run_ends and values arrays.
  RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Array>& run_ends,
                     const std::shared_ptr<Array>& values, int64_t offset = 0);

  /// \brief Construct a RunEndEncodedArray from values and run ends arrays
  ///
  /// The data type is automatically inferred from the arguments.
  /// The run_ends array must have a valid run end type (int16, int32 or
  /// int64) and contain no nulls. The values array must be at least as long
  /// as the run_ends array. An invalid status is returned otherwise.
  static Result<std::shared_ptr<RunEndEncodedArray>> Make(
      int64_t logical_length, const std::shared_ptr<Array>& run_ends,
      const std::shared_ptr<Array>& values, int64_t logical_offset = 0);

 protected:
  /// \brief Set the underlying data and rebuild the child arrays
  void SetData(const std::shared_ptr<ArrayData>& data);

 public:
  /// \brief Returns an array holding the logical indexes of each run-end
  ///
  /// The array is created from the run-ends child as stored: the logical
  /// offset and length of this array are not used to slice it. Use
  /// FindPhysicalOffset() and FindPhysicalLength() to locate the relevant
  /// range.
  const std::shared_ptr<Array>& run_ends() const { return run_ends_array_; }

  /// \brief Returns an array holding the values of each run
  ///
  /// The array is created from the values child as stored: the logical
  /// offset and length of this array are not used to slice it. Use
  /// FindPhysicalOffset() and FindPhysicalLength() to locate the relevant
  /// range.
  const std::shared_ptr<Array>& values() const { return values_array_; }

  /// \brief Find the physical offset of this REE array
  ///
  /// This function uses binary-search, so it has a O(log N) cost.
  int64_t FindPhysicalOffset() const;

  /// \brief Find the physical length of this REE array
  ///
  /// The physical length of an REE is the number of physical values (and
  /// run-ends) necessary to represent the logical range of values from offset
  /// to length.
  ///
  /// Avoid calling this function if the physical length can be established in
  /// some other way (e.g. when iterating over the runs sequentially until the
  /// end). This function uses binary-search, so it has a O(log N) cost.
  int64_t FindPhysicalLength() const;
};
|
||
/// @} | ||
|
||
} // namespace arrow |
Oops, something went wrong.