From 8e18cf343fdfbde0fe817f110ec316f22f420c2e Mon Sep 17 00:00:00 2001
From: Alessandro Molina <amol@turbogears.org>
Date: Tue, 4 Jan 2022 20:42:28 +0100
Subject: [PATCH] ARROW-13035: [C++] indices_nonzero compute function

Add an `indices_nonzero` compute function that returns the indices of an
array that contain values `!= 0` or `!= false`.

This can be used in conjunction with our existing functions that return a
mask to get back the indices where the mask matches.

Closes #11886 from amol-/ARROW-13035

Authored-by: Alessandro Molina <amol@turbogears.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
---
 .../arrow/compute/kernels/vector_selection.cc | 107 ++++++++++++++++++
 .../compute/kernels/vector_selection_test.cc  |  40 +++++++
 docs/source/cpp/compute.rst                   |  11 ++
 docs/source/python/api/compute.rst            |   1 +
 4 files changed, 159 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc
index aece06202fe49..c7f599d509304 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection.cc
@@ -16,8 +16,11 @@
 // under the License.
 
 #include <algorithm>
+#include <cstdint>
 #include <cstring>
 #include <limits>
+#include <memory>
+#include <type_traits>
 
 #include "arrow/array/array_binary.h"
 #include "arrow/array/array_dict.h"
@@ -2355,6 +2358,107 @@ const FunctionDoc array_take_doc(
      "given by `indices`.  Nulls in `indices` emit null in the output."),
     {"array", "indices"}, "TakeOptions");
 
+const FunctionDoc indices_nonzero_doc(
+    "Return the indices of the values in the array that are non-zero",
+    ("For each input value, check if it's zero, false or null. Emit the index\n"
+     "of the value in the array if it's none of those."),
+    {"values"});
+
+// Type visitor that appends to `builder` the position of every valid (non-null)
+// value that evaluates to true, i.e. that is neither zero nor false.
+//
+// Positions are counted across all of `arrays` in order. Indices are written with
+// UnsafeAppend(), so the caller is expected to have reserved room for them.
+struct NonZeroVisitor {
+  UInt64Builder* builder;
+  const ArrayDataVector& arrays;
+
+  NonZeroVisitor(UInt64Builder* builder, const ArrayDataVector& arrays)
+      : builder(builder), arrays(arrays) {}
+
+  // Fallback for every type that is not handled by the overload below.
+  Status Visit(const DataType& type) { return Status::NotImplemented(type.ToString()); }
+
+  template <typename Type>
+  enable_if_t<is_primitive_ctype<Type>::value, Status> Visit(const Type&) {
+    using T = typename GetViewType<Type>::T;
+    uint64_t index = 0;
+
+    for (const auto& current_array : arrays) {
+      VisitArrayDataInline<Type>(
+          *current_array,
+          [&](T v) {
+            if (v) {
+              this->builder->UnsafeAppend(index);
+            }
+            ++index;
+          },
+          // Nulls are never emitted but still advance the running position.
+          [&]() { ++index; });
+    }
+
+    return Status::OK();
+  }
+};
+
+// Kernel entry point. Gathers the ArrayData of the input (a single array, or each
+// chunk of a chunked array) and outputs the indices of its non-zero values as one
+// UInt64 array. Any other kind of Datum is rejected with NotImplemented.
+Status IndicesNonZeroExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  UInt64Builder builder;
+  ArrayDataVector arrays;
+  const Datum& input = batch[0];
+
+  if (input.kind() == Datum::ARRAY) {
+    std::shared_ptr<ArrayData> array = input.array();
+    RETURN_NOT_OK(builder.Reserve(array->length));
+    arrays.push_back(std::move(array));
+  } else if (input.kind() == Datum::CHUNKED_ARRAY) {
+    std::shared_ptr<ChunkedArray> chunkedarr = input.chunked_array();
+    RETURN_NOT_OK(builder.Reserve(chunkedarr->length()));
+    for (int chunkidx = 0; chunkidx < chunkedarr->num_chunks(); ++chunkidx) {
+      arrays.push_back(chunkedarr->chunk(chunkidx)->data());
+    }
+  } else {
+    return Status::NotImplemented(input.ToString());
+  }
+
+  // Dispatch on the type carried by the Datum itself rather than on arrays[0]:
+  // a chunked array may hold no chunk at all, in which case `arrays` is empty.
+  NonZeroVisitor visitor(&builder, arrays);
+  RETURN_NOT_OK(VisitTypeInline(*input.type(), &visitor));
+
+  std::shared_ptr<ArrayData> out_data;
+  RETURN_NOT_OK(builder.FinishInternal(&out_data));
+  out->value = std::move(out_data);
+  return Status::OK();
+}
+
+// Creates the vector function and registers IndicesNonZeroExec for every numeric
+// type and for booleans. All kernels output UInt64.
+std::shared_ptr<VectorFunction> MakeIndicesNonZeroFunction(std::string name,
+                                                           const FunctionDoc* doc) {
+  auto func = std::make_shared<VectorFunction>(name, Arity::Unary(), doc);
+
+  // Every kernel shares the same configuration; only the input type differs.
+  VectorKernel kernel;
+  kernel.exec = IndicesNonZeroExec;
+  kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
+  kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+  kernel.output_chunked = false;
+  kernel.can_execute_chunkwise = false;
+
+  for (const auto& ty : NumericTypes()) {
+    kernel.signature = KernelSignature::Make({InputType(ty->id())}, uint64());
+    DCHECK_OK(func->AddKernel(kernel));
+  }
+
+  kernel.signature = KernelSignature::Make({boolean()}, uint64());
+  DCHECK_OK(func->AddKernel(kernel));
+
+  return func;
+}
+
 }  // namespace
 
 void RegisterVectorSelection(FunctionRegistry* registry) {
@@ -2420,6 +2524,9 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
 
   // DropNull kernel
   DCHECK_OK(registry->AddFunction(std::make_shared<DropNullMetaFunction>()));
+
+  DCHECK_OK(registry->AddFunction(
+      MakeIndicesNonZeroFunction("indices_nonzero", &indices_nonzero_doc)));
 }
 
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
index c0eff3a8e9190..2530d6da86769 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
@@ -2365,5 +2365,45 @@ TEST_F(TestDropNullKernelWithTable, DropNullTableWithSlices) {
   });
 }
 
+TEST(TestIndicesNonZero, IndicesNonZero) {
+  Datum actual;
+  std::shared_ptr<Array> result;
+
+  ASSERT_OK_AND_ASSIGN(
+      actual,
+      CallFunction("indices_nonzero", {ArrayFromJSON(uint32(), "[null, 50, 0, 10]")}));
+  result = actual.make_array();
+  AssertArraysEqual(*result, *ArrayFromJSON(uint64(), "[1, 3]"));
+
+  ASSERT_OK_AND_ASSIGN(
+      actual, CallFunction("indices_nonzero",
+                           {ArrayFromJSON(boolean(), "[null, true, false, true]")}));
+  result = actual.make_array();
+  AssertArraysEqual(*result, *ArrayFromJSON(uint64(), "[1, 3]"));
+
+  ASSERT_OK_AND_ASSIGN(actual,
+                       CallFunction("indices_nonzero",
+                                    {ArrayFromJSON(float64(), "[null, 1.3, 0.0, 5.0]")}));
+  result = actual.make_array();
+  AssertArraysEqual(*result, *ArrayFromJSON(uint64(), "[1, 3]"));
+
+  ASSERT_OK_AND_ASSIGN(actual,
+                       CallFunction("indices_nonzero", {ArrayFromJSON(float64(), "[]")}));
+  result = actual.make_array();
+  AssertArraysEqual(*result, *ArrayFromJSON(uint64(), "[]"));
+
+  ChunkedArray chunkedarr(
+      {ArrayFromJSON(uint32(), "[1, 0, 3]"), ArrayFromJSON(uint32(), "[4, 0, 6]")});
+  ASSERT_OK_AND_ASSIGN(actual,
+                       CallFunction("indices_nonzero", {static_cast<Datum>(chunkedarr)}));
+  AssertArraysEqual(*actual.make_array(), *ArrayFromJSON(uint64(), "[0, 2, 3, 5]"));
+
+  // A chunked array without any chunk must yield an empty result, not a crash.
+  ChunkedArray empty_chunkedarr(ArrayVector{}, uint32());
+  ASSERT_OK_AND_ASSIGN(
+      actual, CallFunction("indices_nonzero", {static_cast<Datum>(empty_chunkedarr)}));
+  AssertArraysEqual(*actual.make_array(), *ArrayFromJSON(uint64(), "[]"));
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index e88bd4a7ec9b2..9a863f6ad739a 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1564,6 +1564,17 @@ These functions select and return a subset of their input.
 * \(4) For each element *i* in input 2 (the indices), the *i*'th element
   in input 1 (the values) is appended to the output.
 
+Containment tests
+~~~~~~~~~~~~~~~~~
+
+This function returns the indices at which array elements are non-null and non-zero.
+
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| Function name         | Arity | Input types                       | Output type    | Options class                   | Notes |
++=======================+=======+===================================+================+=================================+=======+
+| indices_nonzero       | Unary | Boolean, Numeric                  | UInt64         |                                 |       |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+
 Sorts and partitions
 ~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index 516c8c8e3997a..340047130f555 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -344,6 +344,7 @@ Containment Tests
    match_substring
    match_substring_regex
    starts_with
+   indices_nonzero
 
 Categorizations
 ---------------