From dc3f7deafcbeeb0a9a327fc62e3ce95ba8aa9c52 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Thu, 27 Oct 2022 09:02:38 -0400
Subject: [PATCH] Move Thrust's pinned_allocator into cudf

Thrust 1.17 removes experimental/pinned_allocator. Thrust does offer a
replacement in `thrust::system::cuda::universal_host_pinned_memory_resource`,
but adopting it would require converting the consumers into CUDA sources,
which would negatively impact our compile times. Instead we move Thrust's
removed pinned_allocator into cudf and continue to use it.
---
 cpp/benchmarks/io/text/multibyte_split.cpp    |   5 +-
 cpp/include/cudf/utilities/pinned_allocator.h | 214 ++++++++++++++++++
 cpp/src/io/text/bgzip_data_chunk_source.cu    |   4 +-
 .../io/text/data_chunk_source_factories.cpp   |   4 +-
 cpp/src/io/utilities/hostdevice_vector.hpp    |   4 +-
 5 files changed, 222 insertions(+), 9 deletions(-)
 create mode 100644 cpp/include/cudf/utilities/pinned_allocator.h

diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index 380766fee46..d93107ce5a6 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -31,9 +31,9 @@
 #include
 #include
 #include
+#include <cudf/utilities/pinned_allocator.h>
 
 #include
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 
 #include
 #include
@@ -135,8 +135,7 @@ static void bench_multibyte_split(nvbench::state& state,
   auto const delim_factor = static_cast<double>(delim_percent) / 100;
   auto device_input       = create_random_input(file_size_approx, delim_factor, 0.05, delim);
   auto host_input         = std::vector<char>{};
-  auto host_pinned_input =
-    thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>>{};
+  auto host_pinned_input = thrust::host_vector<char, cudf::pinned_allocator<char>>{};
 
   if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file ||
       source_type == data_chunk_source_type::file_bgzip) {
diff --git a/cpp/include/cudf/utilities/pinned_allocator.h b/cpp/include/cudf/utilities/pinned_allocator.h
new file mode 100644
index 00000000000..18e6947b3b2
--- /dev/null
+++ b/cpp/include/cudf/utilities/pinned_allocator.h
@@ -0,0 +1,214 @@
+/*
+ *  Copyright 2008-2022 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/experimental/pinned_allocator.h
+ *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#include <cuda_runtime_api.h>
+#include <cstddef>
+#include <limits>
+#include <new>
+
+
+namespace cudf
+{
+
+/*! \p pinned_allocator is a CUDA-specific host memory allocator
+ *  that employs \c cudaMallocHost for allocation.
+ *
+ *  This implementation is ported from the experimental/pinned_allocator
+ *  that Thrust used to provide.
+ *
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
+ */
+template <typename T> class pinned_allocator;
+
+template <>
+  class pinned_allocator<void>
+{
+  public:
+    typedef void           value_type;
+    typedef void*          pointer;
+    typedef const void*    const_pointer;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+
+    // convert a pinned_allocator<void> to pinned_allocator<U>
+    template <typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    };  // end rebind
+};  // end pinned_allocator
+
+
+template <typename T>
+  class pinned_allocator
+{
+  public:
+    //! \{
+    typedef T              value_type;
+    typedef T*             pointer;
+    typedef const T*       const_pointer;
+    typedef T&             reference;
+    typedef const T&       const_reference;
+    typedef std::size_t    size_type;
+    typedef std::ptrdiff_t difference_type;
+    //! \}
+
+    // convert a pinned_allocator<T> to pinned_allocator<U>
+    template <typename U>
+      struct rebind
+    {
+      typedef pinned_allocator<U> other;
+    };  // end rebind
+
+    /*! \p pinned_allocator's null constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator() {}
+
+    /*! \p pinned_allocator's null destructor does nothing.
+     */
+    __host__ __device__
+    inline ~pinned_allocator() {}
+
+    /*! \p pinned_allocator's copy constructor does nothing.
+     */
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator const&) {}
+
+    /*! This version of \p pinned_allocator's copy constructor
+     *  is templated on the \c value_type of the \p pinned_allocator
+     *  to copy from. It is provided merely for convenience; it
+     *  does nothing.
+     */
+    template <typename U>
+    __host__ __device__
+    inline pinned_allocator(pinned_allocator<U> const&) {}
+
+    /*! This method returns the address of a \c reference of
+     *  interest.
+     *
+     *  \p r The \c reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline pointer address(reference r) { return &r; }
+
+    /*! This method returns the address of a \c const_reference
+     *  of interest.
+     *
+     *  \p r The \c const_reference of interest.
+     *  \return \c r's address.
+     */
+    __host__ __device__
+    inline const_pointer address(const_reference r) { return &r; }
+
+    /*! This method allocates storage for objects in pinned host
+     *  memory.
+     *
+     *  \p cnt The number of objects to allocate.
+     *  \return a \c pointer to the newly allocated objects.
+     *  \note This method does not invoke \p value_type's constructor.
+     *        It is the responsibility of the caller to initialize the
+     *        objects at the returned \c pointer.
+     */
+    __host__
+    inline pointer allocate(size_type cnt,
+                            const_pointer = 0)
+    {
+      if(cnt > this->max_size())
+      {
+        throw std::bad_alloc();
+      }  // end if
+
+      pointer result(0);
+      cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
+
+      if(error)
+      {
+        cudaGetLastError();  // Clear global CUDA error state.
+        throw std::bad_alloc();
+      }  // end if
+
+      return result;
+    }  // end allocate()
+
+    /*! This method deallocates pinned host memory previously allocated
+     *  with this \c pinned_allocator.
+     *
+     *  \p p A \c pointer to the previously allocated memory.
+     *  \p cnt The number of objects previously allocated at
+     *         \p p.
+     *  \note This method does not invoke \p value_type's destructor.
+     *        It is the responsibility of the caller to destroy
+     *        the objects stored at \p p.
+     */
+    __host__
+    inline void deallocate(pointer p, size_type /*cnt*/)
+    {
+      cudaError_t error = cudaFreeHost(p);
+
+      cudaGetLastError();  // Clear global CUDA error state.
+
+      if(error)
+      {
+        cudaGetLastError();  // Clear global CUDA error state.
+        throw thrust::system_error(error, thrust::cuda_category());
+      }  // end if
+    }  // end deallocate()
+
+    /*! This method returns the maximum size of the \c cnt parameter
+     *  accepted by the \p allocate() method.
+     *
+     *  \return The maximum number of objects that may be allocated
+     *          by a single call to \p allocate().
+     */
+    inline size_type max_size() const
+    {
+      return (std::numeric_limits<size_type>::max)() / sizeof(T);
+    }  // end max_size()
+
+    /*! This method tests this \p pinned_allocator for equality to
+     *  another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c true.
+     */
+    __host__ __device__
+    inline bool operator==(pinned_allocator const& x) const { return true; }
+
+    /*! This method tests this \p pinned_allocator for inequality
+     *  to another.
+     *
+     *  \param x The other \p pinned_allocator of interest.
+     *  \return This method always returns \c false.
+     */
+    __host__ __device__
+    inline bool operator!=(pinned_allocator const& x) const { return !operator==(x); }
+};  // end pinned_allocator
+
+/*! \}
+ */
+
+}  // end cudf
diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu
index 7f1f6688bec..02af44f075e 100644
--- a/cpp/src/io/text/bgzip_data_chunk_source.cu
+++ b/cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <cudf/utilities/pinned_allocator.h>
 
 #include
 #include
@@ -30,7 +31,6 @@
 #include
 #include
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 
 #include
 
 #include
@@ -66,7 +66,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
  private:
   template <typename T>
   using pinned_host_vector =
-    thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;
+    thrust::host_vector<T, cudf::pinned_allocator<T>>;
 
   template <typename T>
   static void copy_to_device(const pinned_host_vector<T>& host,
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index 9a549951d66..4e12f9f8fdf 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -18,11 +18,11 @@
 
 #include
 #include
+#include <cudf/utilities/pinned_allocator.h>
 
 #include
 #include
 
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 
 #include
 
@@ -37,7 +37,7 @@ namespace {
 class istream_data_chunk_reader : public data_chunk_reader {
   struct host_ticket {
     cudaEvent_t event;
-    thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>> buffer;
+    thrust::host_vector<char, cudf::pinned_allocator<char>> buffer;
   };
 
  public:
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 6e34d862ed4..80d2427f41a 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -19,12 +19,12 @@
 #include
 #include
 #include
+#include <cudf/utilities/pinned_allocator.h>
 
 #include
 #include
 #include
 
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 
 /**
  * @brief A helper class that wraps fixed-length device memory for the GPU, and
@@ -126,7 +126,7 @@ class hostdevice_vector {
   }
 
  private:
-  thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>> h_data;
+  thrust::host_vector<T, cudf::pinned_allocator<T>> h_data;
   rmm::device_uvector<T> d_data;
 };
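
Note for reviewers: the snippet below is a minimal usage sketch, not part of the patch. It shows how a plain C++ (non-CUDA) translation unit can declare pinned host staging memory with the ported allocator, mirroring the aliases used in the updated sources; the `pinned_host_vector` name, buffer size, and `main` function here are illustrative only.

#include <cudf/utilities/pinned_allocator.h>

#include <thrust/host_vector.h>

// A host_vector whose storage is page-locked ("pinned") memory obtained via
// cudaMallocHost, so async copies to and from the device can use it directly.
template <typename T>
using pinned_host_vector = thrust::host_vector<T, cudf::pinned_allocator<T>>;

int main()
{
  pinned_host_vector<char> staging(1 << 20);  // 1 MiB of pinned host memory
  staging[0] = 'x';                           // behaves like any other host_vector
  return 0;
}  // pinned memory is released via cudaFreeHost when `staging` is destroyed

Because the allocator only needs the CUDA runtime API (cudaMallocHost/cudaFreeHost), this compiles as a host-only source, which is the point of keeping the ported allocator instead of switching to the memory-resource replacement.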