From c34b32b248309382d1329adc97fe7c9d3e55eefe Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 31 Mar 2021 00:51:01 +0530 Subject: [PATCH 01/10] Initial memory tracking resource And sample use in parquet writer bench --- cpp/benchmarks/fixture/benchmark_fixture.hpp | 149 +++++++++++++++++- .../io/parquet/parquet_writer_benchmark.cpp | 3 + 2 files changed, 151 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index dd1bbcba0b4..a330ccf26d9 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -30,8 +30,150 @@ inline auto make_pool() { return rmm::mr::make_owning_wrapper(make_cuda()); } + +template +class memory_tracking_resource final : public rmm::mr::device_memory_resource { + public: + /** + * @brief Construct a new tracking resource adaptor using `upstream` to satisfy + * allocation requests and tracking information about each allocation/free to + * the members current_allocated_size_ and max_allocated_size_. + * + * @throws `rmm::logic_error` if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + */ + memory_tracking_resource(Upstream* upstream) : upstream_{upstream} + { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + memory_tracking_resource() = delete; + ~memory_tracking_resource() = default; + memory_tracking_resource(memory_tracking_resource const&) = delete; + memory_tracking_resource(memory_tracking_resource&&) = default; + memory_tracking_resource& operator=(memory_tracking_resource const&) = delete; + memory_tracking_resource& operator=(memory_tracking_resource&&) = default; + + /** + * @brief Return pointer to the upstream resource. + * + * @return Upstream* Pointer to the upstream resource. + */ + Upstream* get_upstream() const noexcept { return upstream_; } + + /** + * @brief Checks whether the upstream resource supports streams. + * + * @return true The upstream resource supports streams + * @return false The upstream resource does not support streams. + */ + bool supports_streams() const noexcept override { return upstream_->supports_streams(); } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return bool true if the upstream resource supports get_mem_info, false otherwise. + */ + bool supports_get_mem_info() const noexcept override + { + return upstream_->supports_get_mem_info(); + } + + size_t max_allocated_size() const noexcept { return max_allocated_size_; } + size_t current_allocated_size() const noexcept { return current_allocated_size_; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using the upstream + * resource and logs the allocation. + * + * If the upstream allocation is successful updates the current total memory and peak memory + * allocated with this resource + * + * The returned pointer has at least 256B alignment. + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * by the upstream resource. 
+ * + * @param bytes The size, in bytes, of the allocation + * @param stream Stream on which to perform the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + { + auto const p = upstream_->allocate(bytes, stream); + current_allocated_size_ += bytes; + max_allocated_size_ = std::max(current_allocated_size_, max_allocated_size_); + return p; + } + + /** + * @brief Free allocation of size `bytes` pointed to by `p` and log the + * deallocation. + * + * Updates the current total memory and peak memory allocated with this resource + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes Size of the allocation + * @param stream Stream on which to perform the deallocation + */ + void do_deallocate(void* p, std::size_t bytes, rmm::cuda_stream_view stream) override + { + current_allocated_size_ -= bytes; + upstream_->deallocate(p, bytes, stream); + } + + /** + * @brief Compare the upstream resource to another. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + bool do_is_equal(device_memory_resource const& other) const noexcept override + { + if (this == &other) + return true; + else { + memory_tracking_resource const* cast = + dynamic_cast const*>(&other); + if (cast != nullptr) + return upstream_->is_equal(*cast->get_upstream()); + else + return upstream_->is_equal(other); + } + } + + /** + * @brief Get free and available memory from upstream resource. + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. + * + * @param stream Stream on which to get the mem info. + * @return std::pair contaiing free_size and total_size of memory + */ + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override + { + return upstream_->get_mem_info(stream); + } + + size_t current_allocated_size_ = 0; + size_t max_allocated_size_ = 0; + + Upstream* upstream_; ///< The upstream resource used for satisfying + ///< allocation requests +}; + } // namespace +using memory_tracking_pool_resource_type = + memory_tracking_resource; + /** * @brief Google Benchmark fixture for libcudf benchmarks * @@ -67,7 +209,10 @@ class benchmark : public ::benchmark::Fixture { public: virtual void SetUp(const ::benchmark::State& state) { - mr = make_pool(); + auto pool = make_pool(); + mr = std::make_shared( + memory_tracking_pool_resource_type(pool.get())); + pool_mr = pool; rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool } @@ -75,6 +220,7 @@ class benchmark : public ::benchmark::Fixture { { // reset default resource to the initial resource rmm::mr::set_current_device_resource(nullptr); + pool_mr.reset(); mr.reset(); } @@ -86,6 +232,7 @@ class benchmark : public ::benchmark::Fixture { } std::shared_ptr mr; + std::shared_ptr pool_mr; }; } // namespace cudf diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index d17e7b126c7..923af428b8f 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -59,6 +59,9 @@ void BM_parq_write_varying_inout(benchmark::State& state) } state.SetBytesProcessed(data_size * state.iterations()); + auto mr = + dynamic_cast(rmm::mr::get_current_device_resource()); + state.counters["peak mem"] = mr->max_allocated_size(); } void 
BM_parq_write_varying_options(benchmark::State& state) From b9147f432014e0cc3f23e5b782e6c6409e929299 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 31 Mar 2021 03:34:55 +0530 Subject: [PATCH 02/10] Localize the memory usage calculator to just the API being benchmarked Separate out memory_tracking_resource into its own header and remove association with benchmark fixture --- .../common/memory_tracking_resource.hpp | 169 ++++++++++++++++++ cpp/benchmarks/fixture/benchmark_fixture.hpp | 151 +--------------- .../io/parquet/parquet_writer_benchmark.cpp | 16 +- 3 files changed, 185 insertions(+), 151 deletions(-) create mode 100644 cpp/benchmarks/common/memory_tracking_resource.hpp diff --git a/cpp/benchmarks/common/memory_tracking_resource.hpp b/cpp/benchmarks/common/memory_tracking_resource.hpp new file mode 100644 index 00000000000..0a1cc6d175d --- /dev/null +++ b/cpp/benchmarks/common/memory_tracking_resource.hpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { + +/** + * @brief Resource that uses `Upstream` to allocate memory and tracks the current and peak memory + * allocated using this resource + * + * An instance of this resource can be constructed with an existing, upstream resource in order to + * satisfy allocation requests and track memory use. + * + * @tparam Upstream Type of the upstream resource used for allocation/deallocation. + */ +template +class memory_tracking_resource final : public rmm::mr::device_memory_resource { + public: + /** + * @brief Construct a new tracking resource adaptor using `upstream` to satisfy allocation + * requests and tracking information about each allocation/free to the members + * current_allocated_size_ and max_allocated_size_. + * + * @throws `rmm::logic_error` if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + */ + memory_tracking_resource(Upstream* upstream) : upstream_{upstream} + { + RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + memory_tracking_resource() = delete; + ~memory_tracking_resource() = default; + memory_tracking_resource(memory_tracking_resource const&) = delete; + memory_tracking_resource(memory_tracking_resource&&) = default; + memory_tracking_resource& operator=(memory_tracking_resource const&) = delete; + memory_tracking_resource& operator=(memory_tracking_resource&&) = default; + + /** + * @brief Return pointer to the upstream resource. + * + * @return Upstream* Pointer to the upstream resource. + */ + Upstream* get_upstream() const noexcept { return upstream_; } + + /** + * @brief Checks whether the upstream resource supports streams. + * + * @return true The upstream resource supports streams + * @return false The upstream resource does not support streams. 
+ */ + bool supports_streams() const noexcept override { return upstream_->supports_streams(); } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return bool true if the upstream resource supports get_mem_info, false otherwise. + */ + bool supports_get_mem_info() const noexcept override + { + return upstream_->supports_get_mem_info(); + } + + size_t max_allocated_size() const noexcept { return max_allocated_size_; } + size_t current_allocated_size() const noexcept { return current_allocated_size_; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using the upstream resource and updates the + * size of memory in use. + * + * If the upstream allocation is successful updates the current total memory and peak memory + * allocated with this resource + * + * The returned pointer has at least 256B alignment. + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * by the upstream resource. + * + * @param bytes The size, in bytes, of the allocation + * @param stream Stream on which to perform the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + { + auto const p = upstream_->allocate(bytes, stream); + current_allocated_size_ += bytes; + max_allocated_size_ = std::max(current_allocated_size_, max_allocated_size_); + return p; + } + + /** + * @brief Free allocation of size `bytes` pointed to by `p` and log the deallocation. + * + * Updates the current total memory and peak memory allocated with this resource + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes Size of the allocation + * @param stream Stream on which to perform the deallocation + */ + void do_deallocate(void* p, std::size_t bytes, rmm::cuda_stream_view stream) override + { + current_allocated_size_ -= bytes; + upstream_->deallocate(p, bytes, stream); + } + + /** + * @brief Compare the upstream resource to another. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + bool do_is_equal(device_memory_resource const& other) const noexcept override + { + if (this == &other) + return true; + else { + memory_tracking_resource const* cast = + dynamic_cast const*>(&other); + if (cast != nullptr) + return upstream_->is_equal(*cast->get_upstream()); + else + return upstream_->is_equal(other); + } + } + + /** + * @brief Get free and available memory from upstream resource. + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. + * + * @param stream Stream on which to get the mem info. + * @return std::pair contaiing free_size and total_size of memory + */ + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override + { + return upstream_->get_mem_info(stream); + } + + size_t current_allocated_size_ = 0; + size_t max_allocated_size_ = 0; + + Upstream* upstream_; ///< The upstream resource used for satisfying + ///< allocation requests +}; + +} // namespace cudf diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index a330ccf26d9..4a84b479198 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + #include #include #include @@ -30,150 +32,8 @@ inline auto make_pool() { return rmm::mr::make_owning_wrapper(make_cuda()); } - -template -class memory_tracking_resource final : public rmm::mr::device_memory_resource { - public: - /** - * @brief Construct a new tracking resource adaptor using `upstream` to satisfy - * allocation requests and tracking information about each allocation/free to - * the members current_allocated_size_ and max_allocated_size_. - * - * @throws `rmm::logic_error` if `upstream == nullptr` - * - * @param upstream The resource used for allocating/deallocating device memory - */ - memory_tracking_resource(Upstream* upstream) : upstream_{upstream} - { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - } - - memory_tracking_resource() = delete; - ~memory_tracking_resource() = default; - memory_tracking_resource(memory_tracking_resource const&) = delete; - memory_tracking_resource(memory_tracking_resource&&) = default; - memory_tracking_resource& operator=(memory_tracking_resource const&) = delete; - memory_tracking_resource& operator=(memory_tracking_resource&&) = default; - - /** - * @brief Return pointer to the upstream resource. - * - * @return Upstream* Pointer to the upstream resource. - */ - Upstream* get_upstream() const noexcept { return upstream_; } - - /** - * @brief Checks whether the upstream resource supports streams. - * - * @return true The upstream resource supports streams - * @return false The upstream resource does not support streams. - */ - bool supports_streams() const noexcept override { return upstream_->supports_streams(); } - - /** - * @brief Query whether the resource supports the get_mem_info API. - * - * @return bool true if the upstream resource supports get_mem_info, false otherwise. - */ - bool supports_get_mem_info() const noexcept override - { - return upstream_->supports_get_mem_info(); - } - - size_t max_allocated_size() const noexcept { return max_allocated_size_; } - size_t current_allocated_size() const noexcept { return current_allocated_size_; } - - private: - /** - * @brief Allocates memory of size at least `bytes` using the upstream - * resource and logs the allocation. - * - * If the upstream allocation is successful updates the current total memory and peak memory - * allocated with this resource - * - * The returned pointer has at least 256B alignment. - * - * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled - * by the upstream resource. - * - * @param bytes The size, in bytes, of the allocation - * @param stream Stream on which to perform the allocation - * @return void* Pointer to the newly allocated memory - */ - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override - { - auto const p = upstream_->allocate(bytes, stream); - current_allocated_size_ += bytes; - max_allocated_size_ = std::max(current_allocated_size_, max_allocated_size_); - return p; - } - - /** - * @brief Free allocation of size `bytes` pointed to by `p` and log the - * deallocation. - * - * Updates the current total memory and peak memory allocated with this resource - * - * @throws Nothing. 
- * - * @param p Pointer to be deallocated - * @param bytes Size of the allocation - * @param stream Stream on which to perform the deallocation - */ - void do_deallocate(void* p, std::size_t bytes, rmm::cuda_stream_view stream) override - { - current_allocated_size_ -= bytes; - upstream_->deallocate(p, bytes, stream); - } - - /** - * @brief Compare the upstream resource to another. - * - * @throws Nothing. - * - * @param other The other resource to compare to - * @return true If the two resources are equivalent - * @return false If the two resources are not equal - */ - bool do_is_equal(device_memory_resource const& other) const noexcept override - { - if (this == &other) - return true; - else { - memory_tracking_resource const* cast = - dynamic_cast const*>(&other); - if (cast != nullptr) - return upstream_->is_equal(*cast->get_upstream()); - else - return upstream_->is_equal(other); - } - } - - /** - * @brief Get free and available memory from upstream resource. - * - * @throws `rmm::cuda_error` if unable to retrieve memory info. - * - * @param stream Stream on which to get the mem info. - * @return std::pair contaiing free_size and total_size of memory - */ - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override - { - return upstream_->get_mem_info(stream); - } - - size_t current_allocated_size_ = 0; - size_t max_allocated_size_ = 0; - - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests -}; - } // namespace -using memory_tracking_pool_resource_type = - memory_tracking_resource; - /** * @brief Google Benchmark fixture for libcudf benchmarks * @@ -209,10 +69,7 @@ class benchmark : public ::benchmark::Fixture { public: virtual void SetUp(const ::benchmark::State& state) { - auto pool = make_pool(); - mr = std::make_shared( - memory_tracking_pool_resource_type(pool.get())); - pool_mr = pool; + mr = make_pool(); rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool } @@ -220,7 +77,6 @@ class benchmark : public ::benchmark::Fixture { { // reset default resource to the initial resource rmm::mr::set_current_device_resource(nullptr); - pool_mr.reset(); mr.reset(); } @@ -232,7 +88,6 @@ class benchmark : public ::benchmark::Fixture { } std::shared_ptr mr; - std::shared_ptr pool_mr; }; } // namespace cudf diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 923af428b8f..83ff0021516 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -50,6 +51,10 @@ void BM_parq_write_varying_inout(benchmark::State& state) auto const view = tbl->view(); cuio_source_sink_pair source_sink(sink_type); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::parquet_writer_options opts = @@ -57,11 +62,10 @@ void BM_parq_write_varying_inout(benchmark::State& state) .compression(compression); cudf_io::write_parquet(opts); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); - auto mr = - dynamic_cast(rmm::mr::get_current_device_resource()); - state.counters["peak mem"] = mr->max_allocated_size(); + 
state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } void BM_parq_write_varying_options(benchmark::State& state) @@ -79,6 +83,10 @@ void BM_parq_write_varying_options(benchmark::State& state) auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::FILEPATH); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::parquet_writer_options const options = @@ -88,8 +96,10 @@ void BM_parq_write_varying_options(benchmark::State& state) .column_chunks_file_path(file_path); cudf_io::write_parquet(options); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } #define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ From c49d60b4437e563e606a09dfe6a3d694b2900fbe Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 31 Mar 2021 04:30:43 +0530 Subject: [PATCH 03/10] Update cpp/benchmarks/common/memory_tracking_resource.hpp Co-authored-by: David <45795991+davidwendt@users.noreply.github.com> --- cpp/benchmarks/common/memory_tracking_resource.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/benchmarks/common/memory_tracking_resource.hpp b/cpp/benchmarks/common/memory_tracking_resource.hpp index 0a1cc6d175d..2482c0b5d0f 100644 --- a/cpp/benchmarks/common/memory_tracking_resource.hpp +++ b/cpp/benchmarks/common/memory_tracking_resource.hpp @@ -111,8 +111,6 @@ class memory_tracking_resource final : public rmm::mr::device_memory_resource { * * Updates the current total memory and peak memory allocated with this resource * - * @throws Nothing. - * * @param p Pointer to be deallocated * @param bytes Size of the allocation * @param stream Stream on which to perform the deallocation From 4664f9a5295cf2bc190fcca0b5ac0eff8ef35f32 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 31 Mar 2021 14:37:20 +0530 Subject: [PATCH 04/10] Expand mem tracking to all cuIO benchmarks --- cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +- cpp/benchmarks/io/csv/csv_reader_benchmark.cpp | 16 +++++++++++++++- cpp/benchmarks/io/csv/csv_writer_benchmark.cpp | 15 ++++++++++++++- cpp/benchmarks/io/orc/orc_reader_benchmark.cpp | 16 +++++++++++++++- cpp/benchmarks/io/orc/orc_writer_benchmark.cpp | 15 ++++++++++++++- .../io/parquet/parquet_reader_benchmark.cpp | 16 +++++++++++++++- .../io/parquet/parquet_writer_benchmark.cpp | 2 +- 7 files changed, 75 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index 4a84b479198..7ca2300543d 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp index a3ee1a3f333..b3b4996a934 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -51,12 +52,18 @@ void BM_csv_read_varying_input(benchmark::State& state) cudf_io::csv_reader_options const read_options = cudf_io::csv_reader_options::builder(source_sink.make_source_info()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_csv(read_options); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } void BM_csv_read_varying_options(benchmark::State& state) @@ -94,6 +101,10 @@ void BM_csv_read_varying_options(benchmark::State& state) size_t const chunk_size = csv_data.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { @@ -124,8 +135,11 @@ void BM_csv_read_varying_options(benchmark::State& state) cudf_io::read_csv(read_options); } } + rmm::mr::set_current_device_resource(mr); + auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp index fcb9155e646..026b852ebd5 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -42,6 +43,10 @@ void BM_csv_write_varying_inout(benchmark::State& state) auto const view = tbl->view(); cuio_source_sink_pair source_sink(sink_type); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = @@ -50,8 +55,10 @@ void BM_csv_write_varying_inout(benchmark::State& state) .rows_per_chunk(1 << 14); // TODO: remove once default is sensible cudf_io::write_csv(options); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } void BM_csv_write_varying_options(benchmark::State& state) @@ -69,6 +76,10 @@ void BM_csv_write_varying_options(benchmark::State& state) std::string const na_per(na_per_len, '#'); std::vector csv_data; + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = @@ -78,8 +89,10 @@ void BM_csv_write_varying_options(benchmark::State& state) .rows_per_chunk(rows_per_chunk); cudf_io::write_csv(options); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index d38747b934f..51823085132 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -58,12 +59,18 @@ void BM_orc_read_varying_input(benchmark::State& state) cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(source_sink.make_source_info()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_orc(read_opts); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } std::vector get_col_names(std::vector const& orc_data) @@ -112,6 +119,10 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const num_stripes = data_size / (64 << 20); cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 @@ -141,8 +152,11 @@ void BM_orc_read_varying_options(benchmark::State& state) CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table"); } + rmm::mr::set_current_device_resource(mr); + auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } #define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp index bddfc3dfaa2..fa20ea8e96a 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -50,6 +51,10 @@ void BM_orc_write_varying_inout(benchmark::State& state) auto const view = tbl->view(); cuio_source_sink_pair source_sink(sink_type); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::orc_writer_options options = @@ -57,8 +62,10 @@ void BM_orc_write_varying_inout(benchmark::State& state) .compression(compression); cudf_io::write_orc(options); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } void BM_orc_write_varying_options(benchmark::State& state) @@ -75,6 +82,10 @@ void BM_orc_write_varying_options(benchmark::State& state) auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::FILEPATH); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::orc_writer_options const options = @@ -83,8 +94,10 @@ void BM_orc_write_varying_options(benchmark::State& state) .enable_statistics(enable_stats); cudf_io::write_orc(options); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } #define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index 8fc8b29d19d..41afa4c8637 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -58,12 +59,18 @@ void BM_parq_read_varying_input(benchmark::State& state) cudf_io::parquet_reader_options read_opts = cudf_io::parquet_reader_options::builder(source_sink.make_source_info()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer const raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_parquet(read_opts); } + rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } std::vector get_col_names(std::vector const& parquet_data) @@ -112,6 +119,10 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const num_row_groups = data_size / (128 << 20); cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + cudf::memory_tracking_resource tracking_mr(mr); + + rmm::mr::set_current_device_resource(&tracking_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 @@ -141,8 +152,11 @@ void BM_parq_read_varying_options(benchmark::State& state) CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table"); } + rmm::mr::set_current_device_resource(mr); + auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); + state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); } #define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 83ff0021516..ec8da1b6e4d 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 5c90e76a5a46b3e1477bd886b99eaae1d431fd4f Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 31 Mar 2021 14:38:40 +0530 Subject: [PATCH 05/10] Update docs --- cpp/benchmarks/common/memory_tracking_resource.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cpp/benchmarks/common/memory_tracking_resource.hpp b/cpp/benchmarks/common/memory_tracking_resource.hpp index 2482c0b5d0f..cbdb47bde73 100644 --- a/cpp/benchmarks/common/memory_tracking_resource.hpp +++ b/cpp/benchmarks/common/memory_tracking_resource.hpp @@ -91,9 +91,6 @@ class memory_tracking_resource final : public rmm::mr::device_memory_resource { * * The returned pointer has at least 256B alignment. * - * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled - * by the upstream resource. - * * @param bytes The size, in bytes, of the allocation * @param stream Stream on which to perform the allocation * @return void* Pointer to the newly allocated memory @@ -124,8 +121,6 @@ class memory_tracking_resource final : public rmm::mr::device_memory_resource { /** * @brief Compare the upstream resource to another. * - * @throws Nothing. 
- * * @param other The other resource to compare to * @return true If the two resources are equivalent * @return false If the two resources are not equal @@ -147,8 +142,6 @@ class memory_tracking_resource final : public rmm::mr::device_memory_resource { /** * @brief Get free and available memory from upstream resource. * - * @throws `rmm::cuda_error` if unable to retrieve memory info. - * * @param stream Stream on which to get the mem info. * @return std::pair contaiing free_size and total_size of memory */ From 30a921cf0dee6605363b3c13a1b65d9dee00d041 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 2 Jul 2021 00:25:51 +0530 Subject: [PATCH 06/10] Move to new statistics resource --- .../common/memory_tracking_resource.hpp | 160 ------------------ .../io/csv/csv_reader_benchmark.cpp | 15 +- .../io/csv/csv_writer_benchmark.cpp | 15 +- .../io/orc/orc_reader_benchmark.cpp | 15 +- .../io/orc/orc_writer_benchmark.cpp | 15 +- .../io/parquet/parquet_reader_benchmark.cpp | 15 +- .../io/parquet/parquet_writer_benchmark.cpp | 17 +- .../parquet_writer_chunks_benchmark.cpp | 14 ++ 8 files changed, 64 insertions(+), 202 deletions(-) delete mode 100644 cpp/benchmarks/common/memory_tracking_resource.hpp diff --git a/cpp/benchmarks/common/memory_tracking_resource.hpp b/cpp/benchmarks/common/memory_tracking_resource.hpp deleted file mode 100644 index cbdb47bde73..00000000000 --- a/cpp/benchmarks/common/memory_tracking_resource.hpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cudf { - -/** - * @brief Resource that uses `Upstream` to allocate memory and tracks the current and peak memory - * allocated using this resource - * - * An instance of this resource can be constructed with an existing, upstream resource in order to - * satisfy allocation requests and track memory use. - * - * @tparam Upstream Type of the upstream resource used for allocation/deallocation. - */ -template -class memory_tracking_resource final : public rmm::mr::device_memory_resource { - public: - /** - * @brief Construct a new tracking resource adaptor using `upstream` to satisfy allocation - * requests and tracking information about each allocation/free to the members - * current_allocated_size_ and max_allocated_size_. 
- * - * @throws `rmm::logic_error` if `upstream == nullptr` - * - * @param upstream The resource used for allocating/deallocating device memory - */ - memory_tracking_resource(Upstream* upstream) : upstream_{upstream} - { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - } - - memory_tracking_resource() = delete; - ~memory_tracking_resource() = default; - memory_tracking_resource(memory_tracking_resource const&) = delete; - memory_tracking_resource(memory_tracking_resource&&) = default; - memory_tracking_resource& operator=(memory_tracking_resource const&) = delete; - memory_tracking_resource& operator=(memory_tracking_resource&&) = default; - - /** - * @brief Return pointer to the upstream resource. - * - * @return Upstream* Pointer to the upstream resource. - */ - Upstream* get_upstream() const noexcept { return upstream_; } - - /** - * @brief Checks whether the upstream resource supports streams. - * - * @return true The upstream resource supports streams - * @return false The upstream resource does not support streams. - */ - bool supports_streams() const noexcept override { return upstream_->supports_streams(); } - - /** - * @brief Query whether the resource supports the get_mem_info API. - * - * @return bool true if the upstream resource supports get_mem_info, false otherwise. - */ - bool supports_get_mem_info() const noexcept override - { - return upstream_->supports_get_mem_info(); - } - - size_t max_allocated_size() const noexcept { return max_allocated_size_; } - size_t current_allocated_size() const noexcept { return current_allocated_size_; } - - private: - /** - * @brief Allocates memory of size at least `bytes` using the upstream resource and updates the - * size of memory in use. - * - * If the upstream allocation is successful updates the current total memory and peak memory - * allocated with this resource - * - * The returned pointer has at least 256B alignment. - * - * @param bytes The size, in bytes, of the allocation - * @param stream Stream on which to perform the allocation - * @return void* Pointer to the newly allocated memory - */ - void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override - { - auto const p = upstream_->allocate(bytes, stream); - current_allocated_size_ += bytes; - max_allocated_size_ = std::max(current_allocated_size_, max_allocated_size_); - return p; - } - - /** - * @brief Free allocation of size `bytes` pointed to by `p` and log the deallocation. - * - * Updates the current total memory and peak memory allocated with this resource - * - * @param p Pointer to be deallocated - * @param bytes Size of the allocation - * @param stream Stream on which to perform the deallocation - */ - void do_deallocate(void* p, std::size_t bytes, rmm::cuda_stream_view stream) override - { - current_allocated_size_ -= bytes; - upstream_->deallocate(p, bytes, stream); - } - - /** - * @brief Compare the upstream resource to another. - * - * @param other The other resource to compare to - * @return true If the two resources are equivalent - * @return false If the two resources are not equal - */ - bool do_is_equal(device_memory_resource const& other) const noexcept override - { - if (this == &other) - return true; - else { - memory_tracking_resource const* cast = - dynamic_cast const*>(&other); - if (cast != nullptr) - return upstream_->is_equal(*cast->get_upstream()); - else - return upstream_->is_equal(other); - } - } - - /** - * @brief Get free and available memory from upstream resource. 
- * - * @param stream Stream on which to get the mem info. - * @return std::pair contaiing free_size and total_size of memory - */ - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override - { - return upstream_->get_mem_info(stream); - } - - size_t current_allocated_size_ = 0; - size_t max_allocated_size_ = 0; - - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests -}; - -} // namespace cudf diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp index b3b4996a934..8169d4c060e 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp @@ -17,13 +17,14 @@ #include #include -#include #include #include #include #include +#include + // to enable, run cmake with -DBUILD_BENCHMARKS=ON constexpr size_t data_size = 256 << 20; @@ -53,9 +54,9 @@ void BM_csv_read_varying_input(benchmark::State& state) cudf_io::csv_reader_options::builder(source_sink.make_source_info()); rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); - cudf::memory_tracking_resource tracking_mr(mr); + auto statistics_mr = rmm::mr::make_statistics_adaptor(mr); - rmm::mr::set_current_device_resource(&tracking_mr); + rmm::mr::set_current_device_resource(&statistics_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_csv(read_options); @@ -63,7 +64,7 @@ void BM_csv_read_varying_input(benchmark::State& state) rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); - state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); + state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak; } void BM_csv_read_varying_options(benchmark::State& state) @@ -102,9 +103,9 @@ void BM_csv_read_varying_options(benchmark::State& state) size_t const chunk_size = csv_data.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); - cudf::memory_tracking_resource tracking_mr(mr); + auto statistics_mr = rmm::mr::make_statistics_adaptor(mr); - rmm::mr::set_current_device_resource(&tracking_mr); + rmm::mr::set_current_device_resource(&statistics_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { @@ -139,7 +140,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); - state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); + state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak; } #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp index 026b852ebd5..42e12c2d265 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp @@ -17,13 +17,14 @@ #include #include -#include #include #include #include #include +#include + // to enable, run cmake with -DBUILD_BENCHMARKS=ON constexpr size_t data_size = 256 << 20; @@ -44,9 +45,9 @@ void BM_csv_write_varying_inout(benchmark::State& state) cuio_source_sink_pair source_sink(sink_type); rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(); - cudf::memory_tracking_resource tracking_mr(mr); + auto statistics_mr = rmm::mr::make_statistics_adaptor(mr); - rmm::mr::set_current_device_resource(&tracking_mr); + rmm::mr::set_current_device_resource(&statistics_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = @@ -58,7 +59,7 @@ void BM_csv_write_varying_inout(benchmark::State& state) rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); - state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); + state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak; } void BM_csv_write_varying_options(benchmark::State& state) @@ -77,9 +78,9 @@ void BM_csv_write_varying_options(benchmark::State& state) std::string const na_per(na_per_len, '#'); std::vector csv_data; rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); - cudf::memory_tracking_resource tracking_mr(mr); + auto statistics_mr = rmm::mr::make_statistics_adaptor(mr); - rmm::mr::set_current_device_resource(&tracking_mr); + rmm::mr::set_current_device_resource(&statistics_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = @@ -92,7 +93,7 @@ void BM_csv_write_varying_options(benchmark::State& state) rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); - state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); + state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak; } #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index 79ce6bed3ce..77d929aecaa 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -17,13 +17,14 @@ #include #include -#include #include #include #include #include +#include + // to enable, run cmake with -DBUILD_BENCHMARKS=ON constexpr int64_t data_size = 512 << 20; @@ -60,9 +61,9 @@ void BM_orc_read_varying_input(benchmark::State& state) cudf_io::orc_reader_options::builder(source_sink.make_source_info()); rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); - cudf::memory_tracking_resource tracking_mr(mr); + auto statistics_mr = rmm::mr::make_statistics_adaptor(mr); - rmm::mr::set_current_device_resource(&tracking_mr); + rmm::mr::set_current_device_resource(&statistics_mr); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_orc(read_opts); @@ -70,7 +71,7 @@ void BM_orc_read_varying_input(benchmark::State& state) rmm::mr::set_current_device_resource(mr); state.SetBytesProcessed(data_size * state.iterations()); - state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size(); + state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak; } std::vector get_col_names(std::vector const& orc_data) @@ -118,9 +119,9 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const num_stripes = data_size / (64 << 20); cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); - cudf::memory_tracking_resource tracking_mr(mr); + auto statistics_mr = rmm::mr::make_statistics_adaptor(mr); - 
rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
@@ -154,7 +155,7 @@ void BM_orc_read_varying_options(benchmark::State& state)
   auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
   state.SetBytesProcessed(data_processed * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 #define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
index fa20ea8e96a..2a5fe7941c8 100644
--- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
@@ -17,13 +17,14 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include
+
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr int64_t data_size = 512 << 20;
@@ -52,9 +53,9 @@ void BM_orc_write_varying_inout(benchmark::State& state)
   cuio_source_sink_pair source_sink(sink_type);
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  cudf::memory_tracking_resource tracking_mr(mr);
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-  rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::orc_writer_options options =
@@ -65,7 +66,7 @@ void BM_orc_write_varying_inout(benchmark::State& state)
   rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 void BM_orc_write_varying_options(benchmark::State& state)
@@ -83,9 +84,9 @@ void BM_orc_write_varying_options(benchmark::State& state)
   cuio_source_sink_pair source_sink(io_type::FILEPATH);
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  cudf::memory_tracking_resource tracking_mr(mr);
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-  rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::orc_writer_options const options =
@@ -97,7 +98,7 @@ void BM_orc_write_varying_options(benchmark::State& state)
   rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 #define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
index 41afa4c8637..41b073db36c 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
@@ -17,13 +17,14 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include
+
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr size_t data_size = 512 << 20;
@@ -60,9 +61,9 @@ void BM_parq_read_varying_input(benchmark::State& state)
     cudf_io::parquet_reader_options::builder(source_sink.make_source_info());
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  cudf::memory_tracking_resource tracking_mr(mr);
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-  rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer const raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::read_parquet(read_opts);
@@ -70,7 +71,7 @@ void BM_parq_read_varying_input(benchmark::State& state)
   rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 std::vector get_col_names(std::vector const& parquet_data)
@@ -120,9 +121,9 @@ void BM_parq_read_varying_options(benchmark::State& state)
   auto const num_row_groups = data_size / (128 << 20);
   cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  cudf::memory_tracking_resource tracking_mr(mr);
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-  rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
@@ -156,7 +157,7 @@ void BM_parq_read_varying_options(benchmark::State& state)
   auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
   state.SetBytesProcessed(data_processed * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 #define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
index ec8da1b6e4d..02b2bbec3b0 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
@@ -17,13 +17,16 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include
+
+#include
+
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr size_t data_size = 512 << 20;
@@ -52,9 +55,9 @@ void BM_parq_write_varying_inout(benchmark::State& state)
   cuio_source_sink_pair source_sink(sink_type);
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  cudf::memory_tracking_resource tracking_mr(mr);
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-  rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::parquet_writer_options opts =
@@ -65,7 +68,7 @@ void BM_parq_write_varying_inout(benchmark::State& state)
   rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 void BM_parq_write_varying_options(benchmark::State& state)
@@ -84,9 +87,9 @@ void BM_parq_write_varying_options(benchmark::State& state)
   cuio_source_sink_pair source_sink(io_type::FILEPATH);
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  cudf::memory_tracking_resource tracking_mr(mr);
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-  rmm::mr::set_current_device_resource(&tracking_mr);
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::parquet_writer_options const options =
@@ -99,7 +102,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
   rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = tracking_mr.max_allocated_size();
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 #define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
index b38dda4d17e..f30cfa64768 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
@@ -29,6 +29,8 @@
 #include
+#include
+
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr int64_t data_size = 512 << 20;
@@ -47,14 +49,20 @@ void PQ_write(benchmark::State& state)
   auto tbl = create_random_table({cudf::type_id::INT32}, num_cols, table_size_bytes{data_size});
   cudf::table_view view = tbl->view();
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
+
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::parquet_writer_options opts =
       cudf_io::parquet_writer_options::builder(cudf_io::sink_info(), view);
     cudf_io::write_parquet(opts);
   }
+  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0));
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 void PQ_write_chunked(benchmark::State& state)
@@ -68,6 +76,10 @@ void PQ_write_chunked(benchmark::State& state)
       {cudf::type_id::INT32}, num_cols, table_size_bytes{size_t(data_size / num_tables)}));
   }
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
+  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
+
+  rmm::mr::set_current_device_resource(&statistics_mr);
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::chunked_parquet_writer_options opts =
@@ -78,8 +90,10 @@ void PQ_write_chunked(benchmark::State& state)
     });
     writer.close();
   }
+  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0));
+  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
 }
 #define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \

From 8d85fa9bf214255ecee8bf620e0546c480a32074 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Sat, 10 Jul 2021 01:26:57 +0530
Subject: [PATCH 07/10] Proposed RAII statistics resource wrapper

---
 cpp/benchmarks/fixture/benchmark_fixture.hpp | 19 +++++++++++++++++++
 .../io/parquet/parquet_writer_benchmark.cpp | 12 ++----------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index 7ca2300543d..8bfd9bfdfaa 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 namespace cudf {
@@ -90,4 +91,22 @@ class benchmark : public ::benchmark::Fixture {
   std::shared_ptr mr;
 };
+class memory_stats_logger {
+ public:
+  memory_stats_logger()
+    : existing_mr(rmm::mr::get_current_device_resource()),
+      statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr))
+  {
+    rmm::mr::set_current_device_resource(&statistics_mr);
+  }
+
+  ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); }
+
+  size_t peak_memory_usage() { return statistics_mr.get_bytes_counter().peak; }
+
+ private:
+  rmm::mr::device_memory_resource* existing_mr;
+  rmm::mr::statistics_resource_adaptor statistics_mr;
+};
+
 } // namespace cudf
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
index 02b2bbec3b0..4d278a47f79 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
@@ -23,10 +23,6 @@
 #include
-#include
-
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr size_t data_size = 512 << 20;
@@ -54,10 +50,7 @@ void BM_parq_write_varying_inout(benchmark::State& state)
   auto const view = tbl->view();
   cuio_source_sink_pair source_sink(sink_type);
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::parquet_writer_options opts =
@@ -65,10 +58,9 @@ void BM_parq_write_varying_inout(benchmark::State& state)
       .compression(compression);
     cudf_io::write_parquet(opts);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 void BM_parq_write_varying_options(benchmark::State& state)

From 804cb80e33c2c23789c092ad7b72afadcc725757 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Wed, 14 Jul 2021 01:32:17 +0530
Subject: [PATCH 08/10] Change all io benchmarks to use RAII stats logger

---
 cpp/benchmarks/io/csv/csv_reader_benchmark.cpp | 16 ++++------------
 cpp/benchmarks/io/csv/csv_writer_benchmark.cpp | 16 ++++------------
 cpp/benchmarks/io/orc/orc_reader_benchmark.cpp | 16 ++++------------
 cpp/benchmarks/io/orc/orc_writer_benchmark.cpp | 16 ++++------------
 .../io/parquet/parquet_reader_benchmark.cpp | 16 ++++------------
 .../io/parquet/parquet_writer_benchmark.cpp | 8 ++------
 .../parquet/parquet_writer_chunks_benchmark.cpp | 16 ++++------------
 7 files changed, 26 insertions(+), 78 deletions(-)

diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
index 8169d4c060e..b796b284d0a 100644
--- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
@@ -53,18 +53,14 @@ void BM_csv_read_varying_input(benchmark::State& state)
   cudf_io::csv_reader_options const read_options =
     cudf_io::csv_reader_options::builder(source_sink.make_source_info());
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::read_csv(read_options);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 void BM_csv_read_varying_options(benchmark::State& state)
@@ -102,10 +98,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
   size_t const chunk_size = csv_data.size() / num_chunks;
   cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
@@ -136,11 +129,10 @@ void BM_csv_read_varying_options(benchmark::State& state)
       cudf_io::read_csv(read_options);
     }
   }
-  rmm::mr::set_current_device_resource(mr);
   auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
   state.SetBytesProcessed(data_processed * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
index 42e12c2d265..c25b56f88dd 100644
--- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
@@ -44,10 +44,7 @@ void BM_csv_write_varying_inout(benchmark::State& state)
   auto const view = tbl->view();
   cuio_source_sink_pair source_sink(sink_type);
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::csv_writer_options options =
@@ -56,10 +53,9 @@ void BM_csv_write_varying_inout(benchmark::State& state)
       .rows_per_chunk(1 << 14);  // TODO: remove once default is sensible
     cudf_io::write_csv(options);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 void BM_csv_write_varying_options(benchmark::State& state)
@@ -77,10 +73,7 @@ void BM_csv_write_varying_options(benchmark::State& state)
   std::string const na_per(na_per_len, '#');
   std::vector csv_data;
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::csv_writer_options options =
@@ -90,10 +83,9 @@ void BM_csv_write_varying_options(benchmark::State& state)
       .rows_per_chunk(rows_per_chunk);
     cudf_io::write_csv(options);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
index 77d929aecaa..03f5832e500 100644
--- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
@@ -60,18 +60,14 @@ void BM_orc_read_varying_input(benchmark::State& state)
   cudf_io::orc_reader_options read_opts =
     cudf_io::orc_reader_options::builder(source_sink.make_source_info());
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::read_orc(read_opts);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 std::vector get_col_names(std::vector const& orc_data)
@@ -118,10 +114,7 @@ void BM_orc_read_varying_options(benchmark::State& state)
   auto const num_stripes = data_size / (64 << 20);
   cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
@@ -151,11 +144,10 @@ void BM_orc_read_varying_options(benchmark::State& state)
     CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table");
   }
-  rmm::mr::set_current_device_resource(mr);
   auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
   state.SetBytesProcessed(data_processed * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
index 2a5fe7941c8..fbd560071bd 100644
--- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
@@ -52,10 +52,7 @@ void BM_orc_write_varying_inout(benchmark::State& state)
   auto const view = tbl->view();
   cuio_source_sink_pair source_sink(sink_type);
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::orc_writer_options options =
@@ -63,10 +60,9 @@ void BM_orc_write_varying_inout(benchmark::State& state)
      .compression(compression);
     cudf_io::write_orc(options);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 void BM_orc_write_varying_options(benchmark::State& state)
@@ -83,10 +79,7 @@ void BM_orc_write_varying_options(benchmark::State& state)
   auto const view = tbl->view();
   cuio_source_sink_pair source_sink(io_type::FILEPATH);
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::orc_writer_options const options =
@@ -95,10 +88,9 @@ void BM_orc_write_varying_options(benchmark::State& state)
      .enable_statistics(enable_stats);
     cudf_io::write_orc(options);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
index 41b073db36c..42c8b1dc641 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
@@ -60,18 +60,14 @@ void BM_parq_read_varying_input(benchmark::State& state)
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(source_sink.make_source_info());
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer const raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::read_parquet(read_opts);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 std::vector get_col_names(std::vector const& parquet_data)
@@ -120,10 +116,7 @@ void BM_parq_read_varying_options(benchmark::State& state)
   auto const num_row_groups = data_size / (128 << 20);
   cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
@@ -153,11 +146,10 @@ void BM_parq_read_varying_options(benchmark::State& state)
     CUDF_EXPECTS(rows_read == view.num_rows(), "Benchmark did not read the entire table");
   }
-  rmm::mr::set_current_device_resource(mr);
   auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
   state.SetBytesProcessed(data_processed * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
index 4d278a47f79..b4c11179c35 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
@@ -78,10 +78,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
   auto const view = tbl->view();
   cuio_source_sink_pair source_sink(io_type::FILEPATH);
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::parquet_writer_options const options =
@@ -91,10 +88,9 @@ void BM_parq_write_varying_options(benchmark::State& state)
      .column_chunks_file_path(file_path);
     cudf_io::write_parquet(options);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
index f30cfa64768..90758088a4f 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
@@ -49,20 +49,16 @@ void PQ_write(benchmark::State& state)
   auto tbl = create_random_table({cudf::type_id::INT32}, num_cols, table_size_bytes{data_size});
   cudf::table_view view = tbl->view();
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::parquet_writer_options opts =
       cudf_io::parquet_writer_options::builder(cudf_io::sink_info(), view);
     cudf_io::write_parquet(opts);
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0));
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 void PQ_write_chunked(benchmark::State& state)
@@ -76,10 +72,7 @@ void PQ_write_chunked(benchmark::State& state)
       {cudf::type_id::INT32}, num_cols, table_size_bytes{size_t(data_size / num_tables)}));
   }
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
-  auto statistics_mr = rmm::mr::make_statistics_adaptor(mr);
-
-  rmm::mr::set_current_device_resource(&statistics_mr);
+  auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::chunked_parquet_writer_options opts =
@@ -90,10 +83,9 @@ void PQ_write_chunked(benchmark::State& state)
     });
     writer.close();
   }
-  rmm::mr::set_current_device_resource(mr);
   state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0));
-  state.counters["peak_memory_usage"] = statistics_mr.get_bytes_counter().peak;
+  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
 }
 #define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \

From 4662f6e2855989f8645cab72adf66062a85d590c Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 16 Jul 2021 23:16:49 +0530
Subject: [PATCH 09/10] Remove extra header

---
 cpp/benchmarks/io/csv/csv_reader_benchmark.cpp | 2 --
 cpp/benchmarks/io/csv/csv_writer_benchmark.cpp | 2 --
 cpp/benchmarks/io/orc/orc_reader_benchmark.cpp | 2 --
 cpp/benchmarks/io/orc/orc_writer_benchmark.cpp | 2 --
 cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp | 2 --
 cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp | 2 --
 6 files changed, 12 deletions(-)

diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
index b796b284d0a..3f5549a3148 100644
--- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
@@ -23,8 +23,6 @@
 #include
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr size_t data_size = 256 << 20;
diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
index c25b56f88dd..fdd7c63eece 100644
--- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
@@ -23,8 +23,6 @@
 #include
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr size_t data_size = 256 << 20;
diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
index 03f5832e500..549605fbaee 100644
--- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
@@ -23,8 +23,6 @@
 #include
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr int64_t data_size = 512 << 20;
diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
index fbd560071bd..de5dd2c7b9d 100644
--- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
+++ b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
@@ -23,8 +23,6 @@
 #include
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr int64_t data_size = 512 << 20;
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
index 42c8b1dc641..045aa0e043b 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
@@ -23,8 +23,6 @@
 #include
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr size_t data_size = 512 << 20;
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
index 90758088a4f..0041af80a15 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp
@@ -29,8 +29,6 @@
 #include
-#include
-
 // to enable, run cmake with -DBUILD_BENCHMARKS=ON
 constexpr int64_t data_size = 512 << 20;

From baa41504b203072b1656c334730000cd7e443b7e Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Sat, 17 Jul 2021 01:46:46 +0530
Subject: [PATCH 10/10] Update cpp/benchmarks/fixture/benchmark_fixture.hpp

Co-authored-by: Vukasin Milovanovic
---
 cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index 8bfd9bfdfaa..8476a137c12 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -102,7 +102,7 @@ class memory_stats_logger {
   ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); }
-  size_t peak_memory_usage() { return statistics_mr.get_bytes_counter().peak; }
+  size_t peak_memory_usage() const noexcept { return statistics_mr.get_bytes_counter().peak; }
  private:
   rmm::mr::device_memory_resource* existing_mr;
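
For reference, a minimal sketch of how a benchmark consumes the memory_stats_logger introduced in patches 07-10. This is illustrative only and not part of the patch series; the benchmark name, the measured operation, and the fixture include path are assumptions.

#include <benchmark/benchmark.h>

#include <benchmarks/fixture/benchmark_fixture.hpp>  // assumed path of the fixture header

static void BM_peak_memory_example(benchmark::State& state)
{
  // Constructing the logger wraps the current device resource in a
  // statistics_resource_adaptor and makes the adaptor the current resource;
  // the destructor restores the original resource (RAII).
  auto mem_stats_logger = cudf::memory_stats_logger();

  for (auto _ : state) {
    // ... run the libcudf operation being measured; any device allocations it
    // makes go through the adaptor and update its byte counters ...
  }

  // Report the high-water mark of device memory allocated while the logger was alive.
  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
}
BENCHMARK(BM_peak_memory_example);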