Skip to content

Commit

Permalink
ARROW-13035: [C++] indices_nonzero compute function
Browse files Browse the repository at this point in the history
Add an `indices_nonzero` compute function that returns the indices of the array elements whose values are `!= 0` or `!= false`.
This can be used in conjunction with our existing functions that return a mask to get back the indices where the mask matches.

Closes #11886 from amol-/ARROW-13035

Authored-by: Alessandro Molina <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
amol- authored and pitrou committed Jan 4, 2022
1 parent e7dc8f5 commit 8e18cf3
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 0 deletions.
97 changes: 97 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_selection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@
// under the License.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <type_traits>

#include "arrow/array/array_binary.h"
#include "arrow/array/array_dict.h"
Expand Down Expand Up @@ -2355,6 +2358,97 @@ const FunctionDoc array_take_doc(
"given by `indices`. Nulls in `indices` emit null in the output."),
{"array", "indices"}, "TakeOptions");

// User-facing documentation for the "indices_nonzero" compute function:
// a one-line summary, a longer description, and the argument names.
const FunctionDoc indices_nonzero_doc(
    "Return the indices of the values in the array that are non-zero",
    ("For each input value, check if it's zero, false or null. Emit the index\n"
     "of the value in the array if it's none of those."),
    {"values"});

// Type visitor that scans a sequence of array chunks and records, in
// `builder`, the position of every valid element that evaluates to true
// (i.e. is neither zero nor false). Positions are counted continuously
// across all chunks, so the emitted indices are relative to the start of
// the first chunk.
//
// The visitor uses UnsafeAppend: the caller is responsible for reserving
// enough capacity in `builder` beforehand.
struct NonZeroVisitor {
  UInt64Builder* builder;
  const ArrayDataVector& arrays;

  NonZeroVisitor(UInt64Builder* builder, const ArrayDataVector& arrays)
      : builder(builder), arrays(arrays) {}

  // Fallback for every type without a dedicated overload below.
  Status Visit(const DataType& type) { return Status::NotImplemented(type.ToString()); }

  // Overload selected for primitive C-typed arrays.
  template <typename Type>
  enable_if_t<is_primitive_ctype<Type>::value, Status> Visit(const Type&) {
    using ValueType = typename GetViewType<Type>::T;

    // Running position across all chunks; advanced for valid and null
    // slots alike so that emitted indices stay aligned with the input.
    uint64_t position = 0;

    for (const auto& chunk : arrays) {
      VisitArrayDataInline<Type>(
          *chunk,
          // Valid slot: emit the position when the value is truthy.
          [&](ValueType value) {
            if (value) {
              builder->UnsafeAppend(position);
            }
            ++position;
          },
          // Null slot: never emitted, but still occupies a position.
          [&]() { ++position; });
    }

    return Status::OK();
  }
};

// Kernel implementation of "indices_nonzero".
//
// Accepts a single argument that is either an array or a chunked array and
// writes to `out` a uint64 array holding the positions of all elements that
// are valid and non-zero / true. For chunked input the positions are counted
// across chunk boundaries.
//
// Returns NotImplemented for any other kind of Datum, and propagates
// allocation or visitation errors.
Status IndicesNonZeroExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  const Datum& input = batch[0];
  // Allocate the result from the pool configured on the kernel context
  // rather than from the process-wide default pool.
  UInt64Builder builder(ctx->memory_pool());
  ArrayDataVector arrays;

  if (input.kind() == Datum::ARRAY) {
    const std::shared_ptr<ArrayData>& array = input.array();
    // Worst case every element is emitted; reserving up front lets the
    // visitor use UnsafeAppend.
    RETURN_NOT_OK(builder.Reserve(array->length));
    arrays.push_back(array);
  } else if (input.kind() == Datum::CHUNKED_ARRAY) {
    const std::shared_ptr<ChunkedArray>& chunked = input.chunked_array();
    RETURN_NOT_OK(builder.Reserve(chunked->length()));
    arrays.reserve(chunked->num_chunks());
    for (int i = 0; i < chunked->num_chunks(); ++i) {
      arrays.push_back(chunked->chunk(i)->data());
    }
  } else {
    return Status::NotImplemented(input.ToString());
  }

  // A chunked array may legitimately have zero chunks; there is then nothing
  // to visit and `arrays[0]` must not be touched. The result is simply empty.
  if (!arrays.empty()) {
    NonZeroVisitor visitor(&builder, arrays);
    RETURN_NOT_OK(VisitTypeInline(*(arrays[0]->type), &visitor));
  }

  std::shared_ptr<ArrayData> out_data;
  RETURN_NOT_OK(builder.FinishInternal(&out_data));
  out->value = std::move(out_data);
  return Status::OK();
}

std::shared_ptr<VectorFunction> MakeIndicesNonZeroFunction(std::string name,
const FunctionDoc* doc) {
auto func = std::make_shared<VectorFunction>(name, Arity::Unary(), doc);

for (const auto& ty : NumericTypes()) {
VectorKernel kernel;
kernel.exec = IndicesNonZeroExec;
kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
kernel.output_chunked = false;
kernel.can_execute_chunkwise = false;
kernel.signature = KernelSignature::Make({InputType(ty->id())}, uint64());
DCHECK_OK(func->AddKernel(kernel));
}

VectorKernel boolkernel;
boolkernel.exec = IndicesNonZeroExec;
boolkernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
boolkernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
boolkernel.output_chunked = false;
boolkernel.can_execute_chunkwise = false;
boolkernel.signature = KernelSignature::Make({boolean()}, uint64());
DCHECK_OK(func->AddKernel(boolkernel));

return func;
}

} // namespace

void RegisterVectorSelection(FunctionRegistry* registry) {
Expand Down Expand Up @@ -2420,6 +2514,9 @@ void RegisterVectorSelection(FunctionRegistry* registry) {

// DropNull kernel
DCHECK_OK(registry->AddFunction(std::make_shared<DropNullMetaFunction>()));

DCHECK_OK(registry->AddFunction(
MakeIndicesNonZeroFunction("indices_nonzero", &indices_nonzero_doc)));
}

} // namespace internal
Expand Down
34 changes: 34 additions & 0 deletions cpp/src/arrow/compute/kernels/vector_selection_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2365,5 +2365,39 @@ TEST_F(TestDropNullKernelWithTable, DropNullTableWithSlices) {
});
}

// Exercises "indices_nonzero" on unsigned integer, boolean and floating
// point arrays (each with a leading null and a zero/false element), on an
// empty array, and on a two-chunk chunked array whose expected indices span
// the chunk boundary.
TEST(TestIndicesNonZero, IndicesNonZero) {
  struct Case {
    Datum input;
    const char* expected_indices;  // JSON for the expected uint64 array
  };

  ChunkedArray chunkedarr(
      {ArrayFromJSON(uint32(), "[1, 0, 3]"), ArrayFromJSON(uint32(), "[4, 0, 6]")});

  const Case cases[] = {
      {ArrayFromJSON(uint32(), "[null, 50, 0, 10]"), "[1, 3]"},
      {ArrayFromJSON(boolean(), "[null, true, false, true]"), "[1, 3]"},
      {ArrayFromJSON(float64(), "[null, 1.3, 0.0, 5.0]"), "[1, 3]"},
      {ArrayFromJSON(float64(), "[]"), "[]"},
      {static_cast<Datum>(chunkedarr), "[0, 2, 3, 5]"},
  };

  for (const auto& test_case : cases) {
    ASSERT_OK_AND_ASSIGN(Datum actual,
                         CallFunction("indices_nonzero", {test_case.input}));
    AssertArraysEqual(*actual.make_array(),
                      *ArrayFromJSON(uint64(), test_case.expected_indices));
  }
}

} // namespace compute
} // namespace arrow
11 changes: 11 additions & 0 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1564,6 +1564,17 @@ These functions select and return a subset of their input.
* \(4) For each element *i* in input 2 (the indices), the *i*'th element
in input 1 (the values) is appended to the output.

Containment tests
~~~~~~~~~~~~~~~~~

This function returns the indices at which array elements are non-null and non-zero.

+-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+=======================+=======+===================================+================+=================================+=======+
| indices_nonzero | Unary | Boolean, Null, Numeric | UInt64 | | |
+-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+

Sorts and partitions
~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ Containment Tests
match_substring
match_substring_regex
starts_with
indices_nonzero

Categorizations
---------------
Expand Down

0 comments on commit 8e18cf3

Please sign in to comment.