From 9af26dc3056ab6ad8a8bd3f1e7b6123104f70a6e Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:20:04 -0700
Subject: [PATCH 1/5] adds tabulate output iterator

---
 thrust/testing/tabulate_output_iterator.cu    | 138 ++++++++++++++++++
 .../detail/tabulate_output_iterator.inl       |  72 +++++++++
 .../iterator/tabulate_output_iterator.h       | 116 +++++++++++++++
 3 files changed, 326 insertions(+)
 create mode 100644 thrust/testing/tabulate_output_iterator.cu
 create mode 100644 thrust/thrust/iterator/detail/tabulate_output_iterator.inl
 create mode 100644 thrust/thrust/iterator/tabulate_output_iterator.h

diff --git a/thrust/testing/tabulate_output_iterator.cu b/thrust/testing/tabulate_output_iterator.cu
new file mode 100644
index 00000000000..766e47caeb7
--- /dev/null
+++ b/thrust/testing/tabulate_output_iterator.cu
@@ -0,0 +1,138 @@
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/tabulate_output_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <cuda/std/type_traits>
+
+#include <unittest/unittest.h>
+
+template <typename OutItT>
+struct host_write_op
+{
+  OutItT out;
+
+  template <typename IndexT, typename T>
+  __host__ __forceinline__ void operator()(IndexT index, T val)
+  {
+    // val is a thrust::tuple(value, input_index). Only write out the value part.
+    out[index] = thrust::get<0>(val);
+  }
+};
+
+template <typename OutItT>
+struct device_write_op
+{
+  OutItT out;
+
+  template <typename IndexT, typename T>
+  __device__ __forceinline__ void operator()(IndexT index, T val)
+  {
+    // val is a thrust::tuple(value, input_index). Only write out the value part.
+    out[index] = thrust::get<0>(val);
+  }
+};
+
+struct select_op
+{
+  std::size_t select_every_nth;
+
+  template <typename T, typename IndexT>
+  __device__ __host__ __forceinline__ bool operator()(thrust::tuple<T, IndexT> key_index_pair)
+  {
+    // Select every n-th item
+    return (thrust::get<1>(key_index_pair) % select_every_nth == 0);
+  }
+};
+
+struct index_to_gather_index_op
+{
+  std::size_t gather_stride;
+
+  template <typename IndexT>
+  __device__ __host__ __forceinline__ IndexT operator()(IndexT index)
+  {
+    // Gather the i-th output item from input[i*3]
+    return index * static_cast<IndexT>(gather_stride);
+  }
+};
+
+template <class Vector>
+void TestTabulateOutputIterator()
+{
+  using T     = typename Vector::value_type;
+  using it_t  = typename Vector::iterator;
+  using space = typename thrust::iterator_system<typename Vector::iterator>::type;
+
+  static constexpr std::size_t num_items = 240;
+  Vector input(num_items);
+  Vector output(num_items, T{42});
+
+  // Use operator type that supports the targeted system
+  using op_t = typename ::cuda::std::conditional<(::cuda::std::is_same<space, thrust::host_system_tag>::value),
+                                                 host_write_op<it_t>,
+                                                 device_write_op<it_t>>::type;
+
+  // Construct tabulate_output_iterator
+  op_t op{output.begin()};
+  auto tabulate_out_it = thrust::make_tabulate_output_iterator(op);
+
+  // Prepare input
+  thrust::sequence(input.begin(), input.end(), 1);
+  auto iota_it   = thrust::make_counting_iterator(0);
+  auto zipped_in = thrust::make_zip_iterator(input.begin(), iota_it);
+
+  // Run copy_if using tabulate_output_iterator as the output iterator
+  static constexpr std::size_t select_every_nth = 3;
+  auto selected_it_end =
+    thrust::copy_if(zipped_in, zipped_in + num_items, tabulate_out_it, select_op{select_every_nth});
+  const auto num_selected = static_cast<std::size_t>(thrust::distance(tabulate_out_it, selected_it_end));
+
+  // Prepare expected data
+  Vector expected_output(num_items, T{42});
+  const std::size_t expected_num_selected = (num_items + select_every_nth - 1) / select_every_nth;
+  auto gather_index_it =
+    thrust::make_transform_iterator(thrust::make_counting_iterator(0), index_to_gather_index_op{select_every_nth});
+  thrust::gather(gather_index_it, gather_index_it + expected_num_selected, input.cbegin(), expected_output.begin());
+
+  ASSERT_EQUAL(expected_num_selected, num_selected);
+  ASSERT_EQUAL(output, expected_output);
+}
+DECLARE_VECTOR_UNITTEST(TestTabulateOutputIterator);
+
+void TestTabulateOutputIterator()
+{
+  using vector_t = thrust::host_vector<int>;
+  using vec_it_t = typename vector_t::iterator;
+  using op_t     = host_write_op<vec_it_t>;
+
+  vector_t out(4, 42);
+  thrust::tabulate_output_iterator<op_t> tabulate_out_it{op_t{out.begin()}};
+
+  tabulate_out_it[1] = 2;
+  ASSERT_EQUAL(out[0], 42);
+  ASSERT_EQUAL(out[1], 2);
+  ASSERT_EQUAL(out[2], 42);
+  ASSERT_EQUAL(out[3], 42);
+
+  tabulate_out_it[3] = 0;
+  ASSERT_EQUAL(out[0], 42);
+  ASSERT_EQUAL(out[1], 2);
+  ASSERT_EQUAL(out[2], 42);
+  ASSERT_EQUAL(out[3], 0);
+
+  tabulate_out_it[1] = 4;
+  ASSERT_EQUAL(out[0], 42);
+  ASSERT_EQUAL(out[1], 4);
+  ASSERT_EQUAL(out[2], 42);
+  ASSERT_EQUAL(out[3], 0);
+}
+
+DECLARE_UNITTEST(TestTabulateOutputIterator);
diff --git a/thrust/thrust/iterator/detail/tabulate_output_iterator.inl b/thrust/thrust/iterator/detail/tabulate_output_iterator.inl
new file mode 100644
index 00000000000..f473abf5636
--- /dev/null
+++ b/thrust/thrust/iterator/detail/tabulate_output_iterator.inl
@@ -0,0 +1,72 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/tabulate_output_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename BinaryFunction, typename System, typename DifferenceT>
+class tabulate_output_iterator;
+
+namespace detail
+{
+
+// Proxy reference that invokes a BinaryFunction with the index of the dereferenced iterator and the assigned value
+template <typename BinaryFunction, typename DifferenceT>
+class tabulate_output_iterator_proxy
+{
+public:
+  _CCCL_HOST_DEVICE tabulate_output_iterator_proxy(BinaryFunction fun, DifferenceT index)
+      : fun(fun)
+      , index(index)
+  {}
+
+  _CCCL_EXEC_CHECK_DISABLE
+  template <typename T>
+  _CCCL_HOST_DEVICE tabulate_output_iterator_proxy operator=(const T& x)
+  {
+    fun(index, x);
+    return *this;
+  }
+
+private:
+  BinaryFunction fun;
+  DifferenceT index;
+};
+
+// Compute the iterator_adaptor instantiation to be used for tabulate_output_iterator
+template <typename BinaryFunction, typename System, typename DifferenceT>
+struct tabulate_output_iterator_base
+{
+  using type =
+    thrust::iterator_adaptor<tabulate_output_iterator<BinaryFunction, System, DifferenceT>,
+                             counting_iterator<DifferenceT>,
+                             thrust::use_default,
+                             System,
+                             thrust::use_default,
+                             tabulate_output_iterator_proxy<BinaryFunction, DifferenceT>>;
+};
+
+// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <class BinaryFunction, class OutputIterator>
+struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, OutputIterator>>
+    : public thrust::detail::true_type
+{};
+
+} // namespace detail
+THRUST_NAMESPACE_END
diff --git a/thrust/thrust/iterator/tabulate_output_iterator.h b/thrust/thrust/iterator/tabulate_output_iterator.h
new file mode 100644
index 00000000000..e290489e172
--- /dev/null
+++ b/thrust/thrust/iterator/tabulate_output_iterator.h
@@ -0,0 +1,116 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+#include <thrust/iterator/detail/tabulate_output_iterator.inl>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p tabulate_output_iterator is a special kind of output iterator which, whenever a value is assigned to a
+ * dereferenced iterator, calls the given callable with the index of the dereferenced iterator and the the assigned
+ * value.
+ *
+ * The following code snippet demonstrated how to create a \p tabulate_output_iterator which prints the index and the
+ * assigned value.
+ *
+ * \code
+ * #include <thrust/iterator/tabulate_output_iterator.h>
+ *
+ *  // note: functor inherits form binary function
+ *  struct print_op : public thrust::binary_function<int, float, void>
+ *  {
+ *    __host__ __device__
+ *    void operator()(int index, float value) const
+ *    {
+ *      printf("%d: %f\n", index, value);
+ *    }
+ *  };
+ *
+ *  int main()
+ *  {
+ *    auto tabulate_it = thrust::make_tabulate_output_iterator(print_op{});
+ *
+ *    tabulate_it[0] =  1.0f;    // prints: 0: 1.0
+ *    tabulate_it[1] =  3.0f;    // prints: 1: 3.0
+ *    tabulate_it[9] =  5.0f;    // prints: 9: 5.0
+ *  }
+ *  \endcode
+ *
+ *  \see make_tabulate_output_iterator
+ */
+
+template <typename BinaryFunction, typename System = use_default, typename DifferenceT = ptrdiff_t>
+class tabulate_output_iterator : public detail::tabulate_output_iterator_base<BinaryFunction, System, DifferenceT>::type
+{
+  /*! \cond
+   */
+
+public:
+  using super_t = typename detail::tabulate_output_iterator_base<BinaryFunction, System, DifferenceT>::type;
+
+  friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  tabulate_output_iterator() = default;
+
+  /*! This constructor takes as argument a \c BinaryFunction and copies it to a new \p tabulate_output_iterator
+   *
+   * \param fun A \c BinaryFunction called whenever a value is assigned to this \p tabulate_output_iterator.
+   */
+  _CCCL_HOST_DEVICE tabulate_output_iterator(BinaryFunction fun)
+      : fun(fun)
+  {}
+
+  /*! \cond
+   */
+
+private:
+  _CCCL_HOST_DEVICE typename super_t::reference dereference() const
+  {
+    return detail::tabulate_output_iterator_proxy<BinaryFunction, DifferenceT>(fun, *this->base());
+  }
+
+  BinaryFunction fun;
+
+  /*! \endcond
+   */
+}; // end tabulate_output_iterator
+
+/*! \p make_tabulate_output_iterator creates a \p tabulate_output_iterator from a \c BinaryFunction.
+ *
+ *  \param fun The \c BinaryFunction invoked whenever assigning to a dereferenced \p tabulate_output_iterator
+ *  \see tabulate_output_iterator
+ */
+template <typename BinaryFunction>
+tabulate_output_iterator<BinaryFunction> _CCCL_HOST_DEVICE make_tabulate_output_iterator(BinaryFunction fun)
+{
+  return tabulate_output_iterator<BinaryFunction>(fun);
+} // end make_tabulate_output_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+THRUST_NAMESPACE_END

From 5b3c411165c785a04013f26b8d55865d695ffab7 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 22 Aug 2024 22:51:27 -0700
Subject: [PATCH 2/5] uses cccl exec space macros

---
 thrust/testing/tabulate_output_iterator.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/thrust/testing/tabulate_output_iterator.cu b/thrust/testing/tabulate_output_iterator.cu
index 766e47caeb7..8f76acf208c 100644
--- a/thrust/testing/tabulate_output_iterator.cu
+++ b/thrust/testing/tabulate_output_iterator.cu
@@ -20,7 +20,7 @@ struct host_write_op
   OutItT out;
 
   template <typename IndexT, typename T>
-  __host__ __forceinline__ void operator()(IndexT index, T val)
+  _CCCL_HOST void operator()(IndexT index, T val)
   {
     // val is a thrust::tuple(value, input_index). Only write out the value part.
     out[index] = thrust::get<0>(val);
@@ -33,7 +33,7 @@ struct device_write_op
   OutItT out;
 
   template <typename IndexT, typename T>
-  __device__ __forceinline__ void operator()(IndexT index, T val)
+  _CCCL_DEVICE void operator()(IndexT index, T val)
   {
     // val is a thrust::tuple(value, input_index). Only write out the value part.
     out[index] = thrust::get<0>(val);
@@ -45,7 +45,7 @@ struct select_op
   std::size_t select_every_nth;
 
   template <typename T, typename IndexT>
-  __device__ __host__ __forceinline__ bool operator()(thrust::tuple<T, IndexT> key_index_pair)
+  _CCCL_HOST_DEVICE bool operator()(thrust::tuple<T, IndexT> key_index_pair)
   {
     // Select every n-th item
     return (thrust::get<1>(key_index_pair) % select_every_nth == 0);
@@ -57,7 +57,7 @@ struct index_to_gather_index_op
   std::size_t gather_stride;
 
   template <typename IndexT>
-  __device__ __host__ __forceinline__ IndexT operator()(IndexT index)
+  _CCCL_HOST_DEVICE IndexT operator()(IndexT index)
   {
     // Gather the i-th output item from input[i*3]
     return index * static_cast<IndexT>(gather_stride);

From 1b2bd98e500871b90c4d94841793e73062f24797 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 23 Aug 2024 00:35:28 -0700
Subject: [PATCH 3/5] addresses review comments

---
 thrust/testing/tabulate_output_iterator.cu     | 18 +++++++++++++++---
 .../thrust/iterator/tabulate_output_iterator.h |  5 +++--
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/thrust/testing/tabulate_output_iterator.cu b/thrust/testing/tabulate_output_iterator.cu
index 8f76acf208c..789ed6cf04e 100644
--- a/thrust/testing/tabulate_output_iterator.cu
+++ b/thrust/testing/tabulate_output_iterator.cu
@@ -19,6 +19,18 @@ struct host_write_op
 {
   OutItT out;
 
+  template <typename IndexT, typename T>
+  _CCCL_HOST void operator()(IndexT index, T val)
+  {
+    out[index] = val;
+  }
+};
+
+template <typename OutItT>
+struct host_write_first_op
+{
+  OutItT out;
+
   template <typename IndexT, typename T>
   _CCCL_HOST void operator()(IndexT index, T val)
   {
@@ -28,7 +40,7 @@ struct host_write_op
 };
 
 template <typename OutItT>
-struct device_write_op
+struct device_write_first_op
 {
   OutItT out;
 
@@ -77,8 +89,8 @@ void TestTabulateOutputIterator()
 
   // Use operator type that supports the targeted system
   using op_t = typename ::cuda::std::conditional<(::cuda::std::is_same<space, thrust::host_system_tag>::value),
-                                                 host_write_op<it_t>,
-                                                 device_write_op<it_t>>::type;
+                                                 host_write_first_op<it_t>,
+                                                 device_write_first_op<it_t>>::type;
 
   // Construct tabulate_output_iterator
   op_t op{output.begin()};
diff --git a/thrust/thrust/iterator/tabulate_output_iterator.h b/thrust/thrust/iterator/tabulate_output_iterator.h
index e290489e172..8dd26cec79b 100644
--- a/thrust/thrust/iterator/tabulate_output_iterator.h
+++ b/thrust/thrust/iterator/tabulate_output_iterator.h
@@ -12,6 +12,7 @@
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
 #  pragma system_header
 #endif // no system header
+
 #include <thrust/iterator/detail/tabulate_output_iterator.inl>
 
 THRUST_NAMESPACE_BEGIN
@@ -26,8 +27,8 @@ THRUST_NAMESPACE_BEGIN
  */
 
 /*! \p tabulate_output_iterator is a special kind of output iterator which, whenever a value is assigned to a
- * dereferenced iterator, calls the given callable with the index of the dereferenced iterator and the the assigned
- * value.
+ * dereferenced iterator, calls the given callable with the index that corresponds to the offset of the dereferenced
+ * iterator and the the assigned value.
  *
  * The following code snippet demonstrated how to create a \p tabulate_output_iterator which prints the index and the
  * assigned value.

From 34f7b538e5d2bb1b30aa7c2b72815897eae2e3cc Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 23 Aug 2024 06:45:31 -0700
Subject: [PATCH 4/5] fixes documentation and example

---
 thrust/thrust/iterator/tabulate_output_iterator.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/thrust/thrust/iterator/tabulate_output_iterator.h b/thrust/thrust/iterator/tabulate_output_iterator.h
index 8dd26cec79b..2dcfc5efcd7 100644
--- a/thrust/thrust/iterator/tabulate_output_iterator.h
+++ b/thrust/thrust/iterator/tabulate_output_iterator.h
@@ -28,7 +28,7 @@ THRUST_NAMESPACE_BEGIN
 
 /*! \p tabulate_output_iterator is a special kind of output iterator which, whenever a value is assigned to a
  * dereferenced iterator, calls the given callable with the index that corresponds to the offset of the dereferenced
- * iterator and the the assigned value.
+ * iterator and the assigned value.
  *
  * The following code snippet demonstrated how to create a \p tabulate_output_iterator which prints the index and the
  * assigned value.
@@ -37,7 +37,7 @@ THRUST_NAMESPACE_BEGIN
  * #include <thrust/iterator/tabulate_output_iterator.h>
  *
  *  // note: functor inherits form binary function
- *  struct print_op : public thrust::binary_function<int, float, void>
+ *  struct print_op
  *  {
  *    __host__ __device__
  *    void operator()(int index, float value) const

From e7270b0e808d35a900cc1580bd0251018fa4e3d6 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 23 Aug 2024 06:56:04 -0700
Subject: [PATCH 5/5] moves to using alias template instead of member type

---
 thrust/thrust/iterator/detail/tabulate_output_iterator.inl | 7 ++-----
 thrust/thrust/iterator/tabulate_output_iterator.h          | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/thrust/thrust/iterator/detail/tabulate_output_iterator.inl b/thrust/thrust/iterator/detail/tabulate_output_iterator.inl
index f473abf5636..b5ed7258015 100644
--- a/thrust/thrust/iterator/detail/tabulate_output_iterator.inl
+++ b/thrust/thrust/iterator/detail/tabulate_output_iterator.inl
@@ -48,18 +48,15 @@ private:
   DifferenceT index;
 };
 
-// Compute the iterator_adaptor instantiation to be used for tabulate_output_iterator
+// Alias template for the iterator_adaptor instantiation to be used for tabulate_output_iterator
 template <typename BinaryFunction, typename System, typename DifferenceT>
-struct tabulate_output_iterator_base
-{
-  using type =
+using tabulate_output_iterator_base =
     thrust::iterator_adaptor<tabulate_output_iterator<BinaryFunction, System, DifferenceT>,
                              counting_iterator<DifferenceT>,
                              thrust::use_default,
                              System,
                              thrust::use_default,
                              tabulate_output_iterator_proxy<BinaryFunction, DifferenceT>>;
-};
 
 // Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
 // type_traits to enable its use with algorithms.
diff --git a/thrust/thrust/iterator/tabulate_output_iterator.h b/thrust/thrust/iterator/tabulate_output_iterator.h
index 2dcfc5efcd7..af9a244063e 100644
--- a/thrust/thrust/iterator/tabulate_output_iterator.h
+++ b/thrust/thrust/iterator/tabulate_output_iterator.h
@@ -60,13 +60,13 @@ THRUST_NAMESPACE_BEGIN
  */
 
 template <typename BinaryFunction, typename System = use_default, typename DifferenceT = ptrdiff_t>
-class tabulate_output_iterator : public detail::tabulate_output_iterator_base<BinaryFunction, System, DifferenceT>::type
+class tabulate_output_iterator : public detail::tabulate_output_iterator_base<BinaryFunction, System, DifferenceT>
 {
   /*! \cond
    */
 
 public:
-  using super_t = typename detail::tabulate_output_iterator_base<BinaryFunction, System, DifferenceT>::type;
+  using super_t = detail::tabulate_output_iterator_base<BinaryFunction, System, DifferenceT>;
 
   friend class thrust::iterator_core_access;
   /*! \endcond