From 8e18cf343fdfbde0fe817f110ec316f22f420c2e Mon Sep 17 00:00:00 2001
From: Alessandro Molina <amol@turbogears.org>
Date: Tue, 4 Jan 2022 20:42:28 +0100
Subject: [PATCH] ARROW-13035: [C++] indices_nonzero compute function

Add an `indices_nonzero` compute function that returns the indices of the array elements whose values are `!= 0` or `!= false`.
This can be used in conjunction with our existing functions that return a mask to get back the indices where the mask matches.

Closes #11886 from amol-/ARROW-13035

Authored-by: Alessandro Molina <amol@turbogears.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
---
 .../arrow/compute/kernels/vector_selection.cc | 97 +++++++++++++++++++
 .../compute/kernels/vector_selection_test.cc  | 34 +++++++
 docs/source/cpp/compute.rst                   | 11 +++
 docs/source/python/api/compute.rst            |  1 +
 4 files changed, 143 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc
index aece06202fe49..c7f599d509304 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection.cc
@@ -16,8 +16,11 @@
 // under the License.
 
 #include <algorithm>
+#include <cstdint>
 #include <cstring>
 #include <limits>
+#include <memory>
+#include <type_traits>
 
 #include "arrow/array/array_binary.h"
 #include "arrow/array/array_dict.h"
@@ -2355,6 +2358,97 @@ const FunctionDoc array_take_doc(
      "given by `indices`.  Nulls in `indices` emit null in the output."),
     {"array", "indices"}, "TakeOptions");
 
+const FunctionDoc indices_nonzero_doc(
+    "Return the indices of the values in the array that are non-zero",
+    ("For each input value, check if it's zero, false or null. Emit the index\n"
+     "of the value in the array if it's none of those."),
+    {"values"});
+
+// Visitor that appends the position of every non-null, truthy value to `builder`.
+struct NonZeroVisitor {
+  UInt64Builder* builder;
+  const ArrayDataVector& arrays;
+
+  NonZeroVisitor(UInt64Builder* builder, const ArrayDataVector& arrays)
+      : builder(builder), arrays(arrays) {}
+
+  // Fallback overload for every type not handled by the template below.
+  Status Visit(const DataType& type) { return Status::NotImplemented(type.ToString()); }
+
+  // Positions are counted across all of `arrays`, so they index the concatenation.
+  template <typename Type>
+  enable_if_t<is_primitive_ctype<Type>::value, Status> Visit(const Type&) {
+    using T = typename GetViewType<Type>::T;
+    uint64_t position = 0;
+    auto on_valid = [&](T value) {
+      // UnsafeAppend skips the capacity check: the caller must have reserved space.
+      if (value) this->builder->UnsafeAppend(position);
+      ++position;
+    };
+    auto on_null = [&]() { ++position; };  // nulls are skipped but still counted
+
+    for (const auto& chunk : arrays) {
+      VisitArrayDataInline<Type>(*chunk, on_valid, on_null);
+    }
+    return Status::OK();
+  }
+};
+
+// Kernel exec: emits the UInt64 positions of non-null, non-zero values of batch[0].
+Status IndicesNonZeroExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  UInt64Builder builder(ctx->memory_pool());  // allocate from the context's pool
+  ArrayDataVector arrays;
+  const Datum& input = batch[0];
+  // Reserve for the worst case (every value selected): the visitor appends unchecked.
+  if (input.kind() == Datum::ARRAY) {
+    RETURN_NOT_OK(builder.Reserve(input.array()->length));
+    arrays.push_back(input.array());
+  } else if (input.kind() == Datum::CHUNKED_ARRAY) {
+    const std::shared_ptr<ChunkedArray>& chunked = input.chunked_array();
+    RETURN_NOT_OK(builder.Reserve(chunked->length()));
+    for (const auto& chunk : chunked->chunks()) {
+      arrays.push_back(chunk->data());  // data() returns a const ref: copy, not move
+    }
+  } else {
+    return Status::NotImplemented(input.ToString());
+  }
+
+  // Dispatch on the Datum's type, not arrays[0]: a ChunkedArray may have no chunks.
+  NonZeroVisitor visitor(&builder, arrays);
+  RETURN_NOT_OK(VisitTypeInline(*input.type(), &visitor));
+  std::shared_ptr<ArrayData> out_data;
+  RETURN_NOT_OK(builder.FinishInternal(&out_data));
+  out->value = std::move(out_data);
+  return Status::OK();
+}
+
+// Creates the vector function and registers one kernel per supported input type.
+std::shared_ptr<VectorFunction> MakeIndicesNonZeroFunction(std::string name,
+                                                           const FunctionDoc* doc) {
+  auto func = std::make_shared<VectorFunction>(name, Arity::Unary(), doc);
+
+  // All kernels share these settings; only the input type in the signature differs.
+  VectorKernel base;
+  base.exec = IndicesNonZeroExec;
+  base.null_handling = NullHandling::OUTPUT_NOT_NULL;
+  base.mem_allocation = MemAllocation::NO_PREALLOCATE;
+  base.output_chunked = false;
+  base.can_execute_chunkwise = false;
+
+  for (const auto& ty : NumericTypes()) {
+    VectorKernel kernel = base;
+    kernel.signature = KernelSignature::Make({InputType(ty->id())}, uint64());
+    DCHECK_OK(func->AddKernel(kernel));
+  }
+
+  // Boolean input gets its own kernel, registered after the numeric ones.
+  VectorKernel boolkernel = base;
+  boolkernel.signature = KernelSignature::Make({boolean()}, uint64());
+  DCHECK_OK(func->AddKernel(boolkernel));
+
+  return func;
+}
+
 }  // namespace
 
 void RegisterVectorSelection(FunctionRegistry* registry) {
@@ -2420,6 +2514,9 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
 
   // DropNull kernel
   DCHECK_OK(registry->AddFunction(std::make_shared<DropNullMetaFunction>()));
+
+  DCHECK_OK(registry->AddFunction(
+      MakeIndicesNonZeroFunction("indices_nonzero", &indices_nonzero_doc)));
 }
 
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
index c0eff3a8e9190..2530d6da86769 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc
@@ -2365,5 +2365,39 @@ TEST_F(TestDropNullKernelWithTable, DropNullTableWithSlices) {
   });
 }
 
+TEST(TestIndicesNonZero, IndicesNonZero) {
+  // [1, 3] is the expected answer for each of the first three cases below.
+  const auto expected_1_3 = ArrayFromJSON(uint64(), "[1, 3]");
+
+  // Unsigned integers: the null slot and the zero are both left out.
+  const auto uints = ArrayFromJSON(uint32(), "[null, 50, 0, 10]");
+  ASSERT_OK_AND_ASSIGN(Datum uint_out, CallFunction("indices_nonzero", {uints}));
+  AssertArraysEqual(*uint_out.make_array(), *expected_1_3);
+
+  // Booleans: only `true` slots are reported.
+  const auto bools = ArrayFromJSON(boolean(), "[null, true, false, true]");
+  ASSERT_OK_AND_ASSIGN(Datum bool_out, CallFunction("indices_nonzero", {bools}));
+  AssertArraysEqual(*bool_out.make_array(), *expected_1_3);
+
+  // Floating point: 0.0 counts as zero.
+  const auto floats = ArrayFromJSON(float64(), "[null, 1.3, 0.0, 5.0]");
+  ASSERT_OK_AND_ASSIGN(Datum float_out, CallFunction("indices_nonzero", {floats}));
+  AssertArraysEqual(*float_out.make_array(), *expected_1_3);
+
+  // Empty input yields an empty result.
+  const auto empty = ArrayFromJSON(float64(), "[]");
+  ASSERT_OK_AND_ASSIGN(Datum empty_out, CallFunction("indices_nonzero", {empty}));
+  AssertArraysEqual(*empty_out.make_array(), *ArrayFromJSON(uint64(), "[]"));
+
+  // Chunked input: indices are relative to the concatenation of all chunks,
+  // so the second chunk's matches are reported as 3 and 5.
+  ChunkedArray chunked(
+      {ArrayFromJSON(uint32(), "[1, 0, 3]"), ArrayFromJSON(uint32(), "[4, 0, 6]")});
+  ASSERT_OK_AND_ASSIGN(Datum chunked_out,
+                       CallFunction("indices_nonzero", {static_cast<Datum>(chunked)}));
+  AssertArraysEqual(*chunked_out.make_array(),
+                    *ArrayFromJSON(uint64(), "[0, 2, 3, 5]"));
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index e88bd4a7ec9b2..9a863f6ad739a 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1564,6 +1564,17 @@ These functions select and return a subset of their input.
 * \(4) For each element *i* in input 2 (the indices), the *i*'th element
   in input 1 (the values) is appended to the output.
 
+Containment tests
+~~~~~~~~~~~~~~~~~
+
+This function returns the indices at which array elements are non-null and non-zero.
+
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| Function name         | Arity | Input types                       | Output type    | Options class                   | Notes |
++=======================+=======+===================================+================+=================================+=======+
+| indices_nonzero       | Unary | Boolean, Numeric                  | UInt64         |                                 |       |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+
 Sorts and partitions
 ~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index 516c8c8e3997a..340047130f555 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -344,6 +344,7 @@ Containment Tests
    match_substring
    match_substring_regex
    starts_with
+   indices_nonzero
 
 Categorizations
 ---------------