diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 8aed7089dc5..ed600000135 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -576,7 +576,7 @@ add_library(cudftestutil STATIC
             tests/utilities/column_utilities.cu
             tests/utilities/table_utilities.cu
             tests/io/metadata_utilities.cpp
-            tests/strings/utilities.cu)
+            tests/strings/utilities.cpp)
 
 set_target_properties(cudftestutil
                       PROPERTIES BUILD_RPATH "\$ORIGIN"
diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp
index ba2bc245484..0ec2590bdb5 100644
--- a/cpp/benchmarks/common/generate_benchmark_input.cpp
+++ b/cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -297,12 +297,21 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
     }
   }
 
+  // cudf expects the null mask buffer to be padded to a multiple of 64 bytes,
+  // so allocate the proper size and copy what we have.
+  rmm::device_buffer result_bitmask{cudf::bitmask_allocation_size_bytes(num_rows),
+                                    rmm::cuda_stream_default};
+  cudaMemcpyAsync(result_bitmask.data(),
+                  null_mask.data(),
+                  null_mask.size() * sizeof(cudf::bitmask_type),
+                  cudaMemcpyHostToDevice,
+                  rmm::cuda_stream_default);
+
   return std::make_unique<cudf::column>(
     cudf::data_type{cudf::type_to_id<T>()},
     num_rows,
     rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default),
-    rmm::device_buffer(
-      null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default));
+    std::move(result_bitmask));
 }
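For context on the padding rule the benchmark now honors: `cudf::bitmask_allocation_size_bytes` rounds the raw one-bit-per-row mask size up to a multiple of 64 bytes, which is what `cudf::column` expects for its null-mask buffer. A minimal host-only sketch of the sizing rule, using nothing beyond the public `cudf/null_mask.hpp` API:

```cpp
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>

#include <cstdio>

int main()
{
  // One bit per row, rounded up to a multiple of 64 bytes: 1..512 rows all
  // map to 64 bytes, 513 rows to 128 bytes, and so on.
  for (cudf::size_type num_rows : {1, 512, 513, 1024, 1025}) {
    std::printf("rows=%5d -> padded null-mask bytes=%zu\n",
                num_rows,
                cudf::bitmask_allocation_size_bytes(num_rows));
  }
  return 0;
}
```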
 
 /**
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 6661f518639..374af536dc5 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -503,9 +503,12 @@ std::unique_ptr<Base> make_merge_m2_aggregation();
  *
  * Compute covariance between two columns.
  * The input columns are child columns of a non-nullable struct column.
+ * @param min_periods Minimum number of non-null observations required to produce a result.
+ * @param ddof Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is
+ * the number of non-null observations.
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_covariance_aggregation();
+std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
 
 /**
  * @brief Factory to create a CORRELATION aggregation
  *
  * Compute correlation coefficient between two columns.
  * The input columns are child columns of a non-nullable struct column.
  *
- * @param[in] type: correlation_type
+ * @param type correlation_type
+ * @param min_periods Minimum number of non-null observations required to produce a result.
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_correlation_aggregation(correlation_type type);
+std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,
+                                                   size_type min_periods = 1);
 
 /**
  * @brief Factory to create a TDIGEST aggregation
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index 19df8d8e7b6..cffefcaf9cd 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -192,6 +192,15 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op
     case ast_operator::NOT:
       f.template operator()<ast_operator::NOT>(std::forward<Ts>(args)...);
       break;
+    case ast_operator::CAST_TO_INT64:
+      f.template operator()<ast_operator::CAST_TO_INT64>(std::forward<Ts>(args)...);
+      break;
+    case ast_operator::CAST_TO_UINT64:
+      f.template operator()<ast_operator::CAST_TO_UINT64>(std::forward<Ts>(args)...);
+      break;
+    case ast_operator::CAST_TO_FLOAT64:
+      f.template operator()<ast_operator::CAST_TO_FLOAT64>(std::forward<Ts>(args)...);
+      break;
     default:
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Invalid operator.");
@@ -780,6 +789,26 @@ struct operator_functor {
   }
 };
 
+template <typename To>
+struct cast {
+  static constexpr auto arity{1};
+  template <typename From>
+  CUDA_DEVICE_CALLABLE auto operator()(From f) -> decltype(static_cast<To>(f))
+  {
+    return static_cast<To>(f);
+  }
+};
+
+template <>
+struct operator_functor<ast_operator::CAST_TO_INT64, false> : cast<int64_t> {
+};
+template <>
+struct operator_functor<ast_operator::CAST_TO_UINT64, false> : cast<uint64_t> {
+};
+template <>
+struct operator_functor<ast_operator::CAST_TO_FLOAT64, false> : cast<double> {
+};
+
 /*
  * The default specialization of nullable operators is to fall back to the non-nullable
  * implementation
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp
index 5454f9a2b95..7ae40a7d65f 100644
--- a/cpp/include/cudf/ast/expressions.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -88,29 +88,32 @@ enum class ast_operator {
            ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) ==
            ///< LOGICAL_OR(valid, valid)
   // Unary operators
-  IDENTITY,    ///< Identity function
-  SIN,         ///< Trigonometric sine
-  COS,         ///< Trigonometric cosine
-  TAN,         ///< Trigonometric tangent
-  ARCSIN,      ///< Trigonometric sine inverse
-  ARCCOS,      ///< Trigonometric cosine inverse
-  ARCTAN,      ///< Trigonometric tangent inverse
-  SINH,        ///< Hyperbolic sine
-  COSH,        ///< Hyperbolic cosine
-  TANH,        ///< Hyperbolic tangent
-  ARCSINH,     ///< Hyperbolic sine inverse
-  ARCCOSH,     ///< Hyperbolic cosine inverse
-  ARCTANH,     ///< Hyperbolic tangent inverse
-  EXP,         ///< Exponential (base e, Euler number)
-  LOG,         ///< Natural Logarithm (base e)
-  SQRT,        ///< Square-root (x^0.5)
-  CBRT,        ///< Cube-root (x^(1.0/3))
-  CEIL,        ///< Smallest integer value not less than arg
-  FLOOR,       ///< largest integer value not greater than arg
-  ABS,         ///< Absolute value
-  RINT,        ///< Rounds the floating-point argument arg to an integer value
-  BIT_INVERT,  ///< Bitwise Not (~)
-  NOT          ///< Logical Not (!)
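The replacement enum block below adds the three cast operators to the public AST grammar. A hedged sketch of how they can be exercised end to end: `input_table` is a placeholder `cudf::table_view`, and `cudf::compute_column` from `cudf/transform.hpp` is assumed as the evaluation entry point.

```cpp
#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <memory>

// Sketch: materialize column 0 of `input_table` (e.g. a FLOAT64 column)
// as INT64 using the new unary CAST_TO_INT64 operator.
std::unique_ptr<cudf::column> cast_first_column_to_int64(cudf::table_view const& input_table)
{
  auto const col0 = cudf::ast::column_reference(0);
  auto const expr = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, col0);
  return cudf::compute_column(input_table, expr);
}
```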
+ IDENTITY, ///< Identity function + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) + CAST_TO_INT64, ///< Cast value to int64_t + CAST_TO_UINT64, ///< Cast value to uint64_t + CAST_TO_FLOAT64 ///< Cast value to double }; /** diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index e12ed3f521e..69bde7f57fd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -901,7 +901,14 @@ class merge_m2_aggregation final : public groupby_aggregation { */ class covariance_aggregation final : public groupby_aggregation { public: - explicit covariance_aggregation() : aggregation{COVARIANCE} {} + explicit covariance_aggregation(size_type min_periods, size_type ddof) + : aggregation{COVARIANCE}, _min_periods{min_periods}, _ddof(ddof) + { + } + size_type _min_periods; + size_type _ddof; + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } std::unique_ptr clone() const override { @@ -913,6 +920,12 @@ class covariance_aggregation final : public groupby_aggregation { return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const + { + return std::hash{}(_min_periods) ^ std::hash{}(_ddof); + } }; /** @@ -920,8 +933,12 @@ class covariance_aggregation final : public groupby_aggregation { */ class correlation_aggregation final : public groupby_aggregation { public: - explicit correlation_aggregation(correlation_type type) : aggregation{CORRELATION}, _type{type} {} + explicit correlation_aggregation(correlation_type type, size_type min_periods) + : aggregation{CORRELATION}, _type{type}, _min_periods{min_periods} + { + } correlation_type _type; + size_type _min_periods; bool is_equal(aggregation const& _other) const override { @@ -944,7 +961,10 @@ class correlation_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } protected: - size_t hash_impl() const { return std::hash{}(static_cast(_type)); } + size_t hash_impl() const + { + return std::hash{}(static_cast(_type)) ^ std::hash{}(_min_periods); + } }; /** diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 89e589d306a..aac44bed50e 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -24,55 +24,21 @@ namespace cudf { namespace io { namespace detail { namespace csv { + /** - * @brief Class to read CSV dataset data into columns. + * @brief Reads the entire dataset. 
+ *
+ * @param source Input `datasource` object to read the dataset from
+ * @param options Settings for controlling reading behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ *
+ * @return The set of columns along with table metadata
  */
-class reader {
- private:
-  class impl;
-  std::unique_ptr<impl> _impl;
-
- public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  csv_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Constructor from an array of datasources
-   *
-   * @param sources Input `datasource` objects to read the dataset from
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::unique_ptr<datasource>>&& sources,
-                  csv_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Destructor explicitly-declared to avoid inlined in header
-   */
-  ~reader();
-
-  /**
-   * @brief Reads the entire dataset.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return The set of columns along with table metadata
-   */
-  table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
-};
+table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
+                             csv_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr);
 
 class writer {
  public:
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index eee2d396d9f..5fa07fd5568 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -24,7 +24,9 @@
 #include
 #include
+#include <rmm/mr/device/arena_memory_resource.hpp>
 #include
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include
 #include
 #include
@@ -217,6 +219,8 @@ class TempDirTestEnvironment : public ::testing::Environment {
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
+inline auto make_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
+
 inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
 
 inline auto make_pool()
@@ -224,6 +228,11 @@ inline auto make_pool()
 {
   return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
 }
 
+inline auto make_arena()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
+}
+
 inline auto make_binning()
 {
   auto pool = make_pool();
@@ -253,7 +262,9 @@ inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
 {
   if (allocation_mode == "binning") return make_binning();
   if (allocation_mode == "cuda") return make_cuda();
+  if (allocation_mode == "async") return make_async();
   if (allocation_mode == "pool") return make_pool();
+  if (allocation_mode == "arena") return make_arena();
   if (allocation_mode == "managed") return make_managed();
   CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode);
 }
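The two new factories plug additional RMM resources into the test harness's `--rmm_mode` switch. A hedged sketch of selecting one programmatically, assuming only the `cudf_test` fixture header shown above:

```cpp
#include <cudf_test/base_fixture.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Same effect as launching a test binary with `--rmm_mode=arena`:
  // wrap the plain CUDA resource in an arena sub-allocator and make it
  // the default device resource for subsequent RMM allocations.
  auto resource = cudf::test::create_memory_resource("arena");
  rmm::mr::set_current_device_resource(resource.get());
  // ... run code that allocates device memory through RMM ...
  return 0;
}
```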
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 3c6ab157d46..31bf9d65d56 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -713,23 +713,25 @@ template std::unique_ptr make_merge_m2_aggregation
 template <typename Base>
-std::unique_ptr<Base> make_covariance_aggregation()
+std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods, size_type ddof)
 {
-  return std::make_unique<detail::covariance_aggregation>();
+  return std::make_unique<detail::covariance_aggregation>(min_periods, ddof);
 }
-template std::unique_ptr<aggregation> make_covariance_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_covariance_aggregation<groupby_aggregation>();
+template std::unique_ptr<aggregation> make_covariance_aggregation<aggregation>(
+  size_type min_periods, size_type ddof);
+template std::unique_ptr<groupby_aggregation> make_covariance_aggregation<groupby_aggregation>(
+  size_type min_periods, size_type ddof);
 
 /// Factory to create a CORRELATION aggregation
 template <typename Base>
-std::unique_ptr<Base> make_correlation_aggregation(correlation_type type)
+std::unique_ptr<Base> make_correlation_aggregation(correlation_type type, size_type min_periods)
 {
-  return std::make_unique<detail::correlation_aggregation>(type);
+  return std::make_unique<detail::correlation_aggregation>(type, min_periods);
 }
 template std::unique_ptr<aggregation> make_correlation_aggregation<aggregation>(
-  correlation_type type);
+  correlation_type type, size_type min_periods);
 template std::unique_ptr<groupby_aggregation> make_correlation_aggregation<groupby_aggregation>(
-  correlation_type type);
+  correlation_type type, size_type min_periods);
 
 template <typename Base>
 std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids)
diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp
index 1889ae67ee3..ea6894b5ed3 100644
--- a/cpp/src/aggregation/result_cache.cpp
+++ b/cpp/src/aggregation/result_cache.cpp
@@ -30,10 +30,10 @@ void result_cache::add_result(column_view const& input,
 {
   // We can't guarantee that agg will outlive the cache, so we need to take ownership of a copy.
   // To allow lookup by reference, make the key a reference and keep the owner in the value pair.
-  auto owned_agg = agg.clone();
-  auto const& key = *owned_agg;
-  auto value = std::make_pair(std::move(owned_agg), std::move(col));
-  _cache[{input, key}] = std::move(value);
+  auto owned_agg  = agg.clone();
+  auto const& key = *owned_agg;
+  // try_emplace does nothing if the key is already present
+  _cache.try_emplace({input, key}, std::move(owned_agg), std::move(col));
 }
 
 column_view result_cache::get_result(column_view const& input, aggregation const& agg) const
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index e471fccda07..83c6c1bca57 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -573,6 +573,7 @@ void aggregate_result_functor::operator()<aggregation::COVARIANCE>(aggregation c
   CUDF_EXPECTS(values.num_children() == 2,
                "Input to `groupby covariance` must be a structs column having 2 children columns.");
 
+  auto const& cov_agg = dynamic_cast<cudf::detail::covariance_aggregation const&>(agg);
   // Covariance only for valid values in both columns.
   // in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT.
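With the factory definitions above in place, a hedged usage sketch of the new knobs: `keys` and `xy` are placeholder views, where `xy` must be the non-nullable struct column holding the two numeric children these aggregations expect.

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

// Sketch: one grouped request computing COVARIANCE and Pearson CORRELATION
// over the paired values in `xy`, requiring at least 3 valid pairs per group
// and dividing by N - ddof with ddof = 1 (sample covariance).
std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::groupby::aggregation_result>>
grouped_cov_and_corr(cudf::table_view const& keys, cudf::column_view const& xy)
{
  cudf::groupby::groupby gb(keys);
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = xy;
  requests[0].aggregations.push_back(
    cudf::make_covariance_aggregation<cudf::groupby_aggregation>(/*min_periods=*/3, /*ddof=*/1));
  requests[0].aggregations.push_back(cudf::make_correlation_aggregation<cudf::groupby_aggregation>(
    cudf::correlation_type::PEARSON, /*min_periods=*/3));
  return gb.aggregate(requests);
}
```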
auto [_, values_child0, values_child1] = @@ -596,6 +597,8 @@ void aggregate_result_functor::operator()(aggregation c count, mean0, mean1, + cov_agg._min_periods, + cov_agg._ddof, stream, mr)); }; @@ -629,28 +632,33 @@ void aggregate_result_functor::operator()(aggregation aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*std_agg); aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*std_agg); - auto const stddev0 = cache.get_result(values_child0, *std_agg); - auto const stddev1 = cache.get_result(values_child1, *std_agg); - - auto mean_agg = make_mean_aggregation(); - auto const mean0 = cache.get_result(values_child0, *mean_agg); - auto const mean1 = cache.get_result(values_child1, *mean_agg); - auto count_agg = make_count_aggregation(); - auto const count = cache.get_result(values_child0, *count_agg); - // Compute covariance here to avoid repeated computation of mean & count - auto cov_agg = make_covariance_aggregation(); - cache.add_result(values, - *cov_agg, - detail::group_covariance(get_grouped_values().child(0), - get_grouped_values().child(1), - helper.group_labels(stream), - helper.num_groups(stream), - count, - mean0, - mean1, - stream, - mr)); + auto cov_agg = make_covariance_aggregation(corr_agg._min_periods); + if (not cache.has_result(values, *cov_agg)) { + auto mean_agg = make_mean_aggregation(); + auto const mean0 = cache.get_result(values_child0, *mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); + + auto const& cov_agg_obj = dynamic_cast(*cov_agg); + cache.add_result(values, + *cov_agg, + detail::group_covariance(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + cov_agg_obj._min_periods, + cov_agg_obj._ddof, + stream, + mr)); + } + + auto const stddev0 = cache.get_result(values_child0, *std_agg); + auto const stddev1 = cache.get_result(values_child1, *std_agg); auto const covariance = cache.get_result(values, *cov_agg); cache.add_result( values, agg, detail::group_correlation(covariance, stddev0, stddev1, stream, mr)); diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu index e43d0185e93..cdcf4311be7 100644 --- a/cpp/src/groupby/sort/group_correlation.cu +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -113,6 +113,8 @@ std::unique_ptr group_covariance(column_view const& values_0, column_view const& count, column_view const& mean_0, column_view const& mean_1, + size_type min_periods, + size_type ddof, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -140,8 +142,13 @@ std::unique_ptr group_covariance(column_view const& values_0, auto d_values_0 = column_device_view::create(values_0, stream); auto d_values_1 = column_device_view::create(values_1, stream); - covariance_transform covariance_transform_op{ - *d_values_0, *d_values_1, mean0_ptr, mean1_ptr, count.data(), group_labels.begin()}; + covariance_transform covariance_transform_op{*d_values_0, + *d_values_1, + mean0_ptr, + mean1_ptr, + count.data(), + group_labels.begin(), + ddof}; auto result = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); @@ -157,8 +164,8 @@ std::unique_ptr group_covariance(column_view const& values_0, thrust::make_discard_iterator(), d_result); - auto is_null = [ddof = 
covariance_transform_op.ddof] __device__(size_type group_size) { - return not(group_size == 0 or group_size - ddof <= 0); + auto is_null = [ddof, min_periods] __device__(size_type group_size) { + return not(group_size == 0 or group_size - ddof <= 0 or group_size < min_periods); }; auto [new_nullmask, null_count] = cudf::detail::valid_if(count.begin(), count.end(), is_null, stream, mr); diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index df9f5b391fb..935ef9554a9 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -52,14 +52,12 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); - table_view const order_table{{std::get<0>(superimposed)}}; auto const flattened = cudf::structs::detail::flatten_nested_columns( - order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); auto const d_flat_order = table_device_view::create(flattened, stream); row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); auto ranks = make_fixed_width_column(data_type{type_to_id()}, - order_table.num_rows(), + flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 789a289a07e..75708c7b01c 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -451,6 +451,8 @@ std::unique_ptr group_merge_m2(column_view const& values, * @param count The count of valid rows of the grouped values of both columns * @param mean_0 The mean of the first grouped values column * @param mean_1 The mean of the second grouped values column + * @param min_periods The minimum number of non-null rows required to consider the covariance + * @param ddof The delta degrees of freedom used in the calculation of the variance * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory */ @@ -461,6 +463,8 @@ std::unique_ptr group_covariance(column_view const& values_0, column_view const& count, column_view const& mean_0, column_view const& mean_1, + size_type min_periods, + size_type ddof, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index d0e47d93bc6..b9915da90b9 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -238,6 +238,7 @@ struct HasherDispatcher { std::is_same_v) { hasher->process(input_col.element(row_index)); } else { + (void)row_index; cudf_assert(false && "Unsupported type for hash function."); } } @@ -263,6 +264,8 @@ struct ListHasherDispatcher { if (input_col.is_valid(i)) { hasher->process(input_col.element(i)); } } } else { + (void)offset_begin; + (void)offset_end; cudf_assert(false && "Unsupported type for hash function."); } } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 99b593c99b9..7f032b6987c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -19,14 +19,21 @@ * @brief cuDF-IO CSV reader class implementation */ -#include "reader_impl.hpp" +#include "csv_common.h" +#include "csv_gpu.h" #include +#include +#include #include #include #include #include +#include +#include +#include +#include #include #include #include @@ -37,10 +44,14 @@ #include #include +#include #include +#include #include #include #include +#include +#include using std::string; using std::vector; @@ -56,27 +67,40 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; +namespace { + /** - * @brief Translates a dtype string and returns its dtype enumeration and any - * extended dtype flags that are supported by cuIO. Often, this is a column - * with the same underlying dtype the basic types, but with different parsing - * interpretations. - * - * @param[in] dtype String containing the basic or extended dtype + * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. * - * @return Tuple of data_type and flags + * Row offsets are stored this way to avoid reallocation/copies when discarding front or back + * elements. 
*/ -std::tuple get_dtype_info(const std::string& dtype) -{ - if (dtype == "hex" || dtype == "hex64") { - return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal); - } - if (dtype == "hex32") { - return std::make_tuple(data_type{cudf::type_id::INT32}, column_parse::as_hexadecimal); +class selected_rows_offsets { + rmm::device_uvector all; + device_span selected; + + public: + selected_rows_offsets(rmm::device_uvector&& data, + device_span selected_span) + : all{std::move(data)}, selected{selected_span} + { } + selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - return std::make_tuple(convert_string_to_dtype(dtype), column_parse::as_default); -} + operator device_span() const { return selected; } + void shrink(size_t size) + { + CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); + selected = selected.subspan(0, size); + } + void erase_first_n(size_t n) + { + CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); + selected = selected.subspan(n, selected.size() - n); + } + auto size() const { return selected.size(); } + auto data() const { return selected.data(); } +}; /** * @brief Removes the first and Last quote in the string @@ -96,10 +120,10 @@ string removeQuotes(string str, char quotechar) * @brief Parse the first row to set the column names in the raw_csv parameter. * The first row can be either the header row, or the first data row */ -std::vector setColumnNames(std::vector const& header, - parse_options_view const& opts, - int header_row, - std::string prefix) +std::vector get_column_names(std::vector const& header, + parse_options_view const& parse_opts, + int header_row, + std::string prefix) { std::vector col_names; @@ -112,35 +136,36 @@ std::vector setColumnNames(std::vector const& header, bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts.terminator) || - (!quotation && first_row[pos] == opts.delimiter)) { + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.terminator) || + (!quotation && first_row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == opts.delimiter || first_row[pos] == opts.terminator) { + if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && opts.terminator == '\n' && first_row[pos] == '\n' && + if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && first_row[pos - 1] == '\r') { --col_name_len; } const string new_col_name(first_row.data() + prev, col_name_len); - col_names.push_back(removeQuotes(new_col_name, opts.quotechar)); + col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); // Stop parsing when we hit the line terminator; relevant when there is // a blank line following the header. 
In this case, first_row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == opts.terminator) { break; } + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name col_names.push_back(prefix + std::to_string(num_cols)); @@ -148,8 +173,8 @@ std::vector setColumnNames(std::vector const& header, num_cols++; // Skip adjacent delimiters if delim_whitespace is set - while (opts.multi_delimiter && pos < first_row.size() && first_row[pos] == opts.delimiter && - first_row[pos + 1] == opts.delimiter) { + while (parse_opts.multi_delimiter && pos < first_row.size() && + first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; @@ -170,277 +195,43 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) -{ - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto range_size_padded = opts_.get_byte_range_size_with_padding(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); - - if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, - "Reading compressed data using `byte range` is unsupported"); - } - - // Transfer source data to GPU - if (!source_->is_empty()) { - auto const data_size = (range_size_padded != 0) ? range_size_padded : source_->size(); - auto const buffer = source_->host_read(range_offset, data_size); - - auto h_data = host_span( // - reinterpret_cast(buffer->data()), - buffer->size()); - - std::vector h_uncomp_data_owner; - - if (opts_.get_compression() != compression_type::NONE) { - h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); - h_data = h_uncomp_data_owner; - } - - // None of the parameters for row selection is used, we are parsing the entire file - const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && - skip_end_rows <= 0 && num_rows == -1; - - // With byte range, find the start of the first data row - size_t const data_start_offset = (range_offset != 0) ? find_first_row_start(h_data) : 0; - - // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || opts_.get_header() < 0), - "byte_range offset with header not supported"); - - // Gather row offsets - auto data_row_offsets = - load_data_and_gather_row_offsets(h_data, - data_start_offset, - (range_size) ? range_size : h_data.size(), - (skip_rows > 0) ? 
skip_rows : 0, - num_rows, - load_whole_file, - stream); - auto& row_offsets = data_row_offsets.second; - // Exclude the rows that are to be skipped from the end - if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { - row_offsets.shrink(row_offsets.size() - skip_end_rows); - } - return data_row_offsets; - } - return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; -} - -std::vector reader::impl::select_data_types( - std::map const& col_type_map) -{ - std::vector selected_dtypes; - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - auto const col_type_it = col_type_map.find(col_names_[col]); - CUDF_EXPECTS(col_type_it != col_type_map.end(), - "Must specify data types for all active columns"); - selected_dtypes.emplace_back(col_type_it->second); - } - } - return selected_dtypes; -} - -std::vector reader::impl::select_data_types(std::vector const& dtypes) -{ - std::vector selected_dtypes; - - if (dtypes.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - selected_dtypes.resize(num_active_cols_, dtypes.front()); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, - "Must specify data types for all columns"); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } - } - } - return selected_dtypes; -} - -table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) -{ - auto const data_row_offsets = select_data_and_row_offsets(stream); - auto const& data = data_row_offsets.first; - auto const& row_offsets = data_row_offsets.second; - - // Exclude the end-of-data row from number of rows with actual data - num_records_ = std::max(row_offsets.size(), 1ul) - 1; - - // Check if the user gave us a list of column names - if (not opts_.get_names().empty()) { - column_flags_.resize(opts_.get_names().size(), column_parse::enabled); - col_names_ = opts_.get_names(); - } else { - col_names_ = setColumnNames(header_, opts.view(), opts_.get_header(), opts_.get_prefix()); - - num_actual_cols_ = num_active_cols_ = col_names_.size(); - - column_flags_.resize(num_actual_cols_, column_parse::enabled); - - // Rename empty column names to "Unnamed: col_index" - for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) { - if (col_names_[col_idx].empty()) { - col_names_[col_idx] = string("Unnamed: ") + std::to_string(col_idx); - } - } - - // Looking for duplicates - std::unordered_map col_names_histogram; - for (auto& col_name : col_names_) { - // Operator [] inserts a default-initialized value if the given key is not - // present - if (++col_names_histogram[col_name] > 1) { - if (opts_.is_enabled_mangle_dupe_cols()) { - // Rename duplicates of column X as X.1, X.2, ...; First appearance - // stays as X - do { - col_name += "." 
+ std::to_string(col_names_histogram[col_name] - 1); - } while (col_names_histogram[col_name]++); - } else { - // All duplicate columns will be ignored; First appearance is parsed - const auto idx = &col_name - col_names_.data(); - column_flags_[idx] = column_parse::disabled; - } - } - } - - // Update the number of columns to be processed, if some might have been - // removed - if (!opts_.is_enabled_mangle_dupe_cols()) { num_active_cols_ = col_names_histogram.size(); } - } - - // User can specify which columns should be parsed - if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) { - std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); - - for (const auto index : opts_.get_use_cols_indexes()) { - column_flags_[index] = column_parse::enabled; - } - num_active_cols_ = std::unordered_set(opts_.get_use_cols_indexes().begin(), - opts_.get_use_cols_indexes().end()) - .size(); - - for (const auto& name : opts_.get_use_cols_names()) { - const auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - auto curr_it = it - col_names_.begin(); - if (column_flags_[curr_it] == column_parse::disabled) { - column_flags_[curr_it] = column_parse::enabled; - num_active_cols_++; - } - } - } - } - - // User can specify which columns should be read as datetime - if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { - for (const auto index : opts_.get_parse_dates_indexes()) { - column_flags_[index] |= column_parse::as_datetime; - } - - for (const auto& name : opts_.get_parse_dates_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; - } - } - } - - // User can specify which columns should be parsed as hexadecimal - if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { - for (const auto index : opts_.get_parse_hex_indexes()) { - column_flags_[index] |= column_parse::as_hexadecimal; - } - - for (const auto& name : opts_.get_parse_hex_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; - } - } - } - - // Return empty table rather than exception if nothing to load - if (num_active_cols_ == 0) { return {std::make_unique(), {}}; } - - auto metadata = table_metadata{}; - auto out_columns = std::vector>(); - - bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes()); - - std::vector column_types; - if (has_to_infer_column_types) { - column_types = infer_column_types(data, row_offsets, stream); - } else { - column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, - opts_.get_dtypes()); - } - - out_columns.reserve(column_types.size()); - - if (num_records_ != 0) { - auto out_buffers = decode_data(data, row_offsets, column_types, stream); - for (size_t i = 0; i < column_types.size(); ++i) { - metadata.column_names.emplace_back(out_buffers[i].name); - if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' && - opts.doublequote == true) { - // PANDAS' default behavior of enabling doublequote for two consecutive - // quotechars in quoted fields results in reduction to a single quotechar - // TODO: Would be much more efficient to perform this operation in-place - // during the conversion stage - 
const std::string quotechar(1, opts.quotechar); - const std::string dblquotechar(2, opts.quotechar); - std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); - out_columns.emplace_back( - cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); - } else { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_)); - } - } - } else { - // Create empty columns - for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_empty_column(column_types[i])); - } - // Handle empty metadata - for (int col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { - metadata.column_names.emplace_back(col_names_[col]); - } - } - } - return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; -} - -size_t reader::impl::find_first_row_start(host_span data) +size_t find_first_row_start(char row_terminator, host_span data) { // For now, look for the first terminator (assume the first terminator isn't within a quote) // TODO: Attempt to infer this from the data size_t pos = 0; - while (pos < data.size() && data[pos] != opts.terminator) { + while (pos < data.size() && data[pos] != row_terminator) { ++pos; } return std::min(pos + 1, data.size()); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream) +/** + * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. + * + * This function scans the input data to record the row offsets (relative to the start of the + * input data). A row is actually the data/offset between two termination symbols. + * + * @param data Uncompressed input data in host memory + * @param range_begin Only include rows starting after this position + * @param range_end Only include rows starting before this position + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read; -1: all remaining data + * @param load_whole_file Hint that the entire data will be needed on gpu + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory + */ +std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + std::vector& header, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -449,7 +240,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, hostdevice_vector row_ctx(max_blocks); size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (opts_.get_header() >= 0) ? opts_.get_header() + 1 : 0; + size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; uint64_t ctx = 0; // For compatibility with the previous parser, a row is considered in-range if the @@ -475,7 +266,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
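As context for the two-pass structure here: this row-offset machinery backs the public byte-range read path. A hedged sketch of that path from the user side; `"data.csv"` is a placeholder, and the header restriction mirrors the `CUDF_EXPECTS` enforced in this file (a non-zero byte range cannot also parse a header).

```cpp
#include <cudf/io/csv.hpp>

#include <cstddef>

// Sketch: read the rows whose start offset falls inside a 1 MiB window.
// find_first_row_start() is what lets a non-zero offset skip the partial
// row at the front of the window.
cudf::io::table_with_metadata read_csv_slice(std::size_t offset)
{
  auto const opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{"data.csv"})
                      .byte_range_offset(offset)
                      .byte_range_size(1024 * 1024)
                      .header(offset == 0 ? 0 : -1)  // header only allowed with offset == 0
                      .build();
  return cudf::io::read_csv(opts);
}
```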
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(opts.view(), + uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), device_span(), d_data, @@ -514,7 +305,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, stream.value())); // Pass 2: Output row offsets - cudf::io::csv::gpu::gather_row_offsets(opts.view(), + cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), all_row_offsets, d_data, @@ -551,8 +342,8 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, // num_rows does not include blank rows if (num_rows >= 0) { if (all_row_offsets.size() > header_rows + static_cast(num_rows)) { - size_t num_blanks = - cudf::io::csv::gpu::count_blank_rows(opts.view(), d_data, all_row_offsets, stream); + size_t num_blanks = cudf::io::csv::gpu::count_blank_rows( + parse_opts.view(), d_data, all_row_offsets, stream); if (all_row_offsets.size() - num_blanks > header_rows + static_cast(num_rows)) { // Got the desired number of rows break; @@ -571,7 +362,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, } while (pos < data.size()); auto const non_blank_row_offsets = - io::csv::gpu::remove_blank_rows(opts.view(), d_data, all_row_offsets, stream); + io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header @@ -588,7 +379,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, const auto header_end = buffer_pos + row_ctx[1]; CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), "Invalid csv header location"); - header_.assign(data.begin() + header_start, data.begin() + header_end); + header.assign(data.begin() + header_start, data.begin() + header_end); if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -598,30 +389,145 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader::impl::infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream) +std::pair, selected_rows_offsets> select_data_and_row_offsets( + cudf::io::datasource* source, + csv_reader_options const& reader_opts, + std::vector& header, + parse_options const& parse_opts, + rmm::cuda_stream_view stream) +{ + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); + auto skip_rows = reader_opts.get_skiprows(); + auto skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); + + if (range_offset > 0 || range_size > 0) { + CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, + "Reading compressed data using `byte range` is unsupported"); + } + + // Transfer source data to GPU + if (!source->is_empty()) { + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); + auto buffer = source->host_read(range_offset, data_size); + + auto h_data = host_span( // + reinterpret_cast(buffer->data()), + buffer->size()); + + std::vector h_uncomp_data_owner; + + if (reader_opts.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, reader_opts.get_compression()); + h_data = h_uncomp_data_owner; + } + // None of the parameters for row selection is used, we are parsing the entire file + const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && + skip_end_rows <= 0 && num_rows == -1; + + // With byte range, find the start of the first data row + size_t const data_start_offset = + (range_offset != 0) ? find_first_row_start(parse_opts.terminator, h_data) : 0; + + // TODO: Allow parsing the header outside the mapped range + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), + "byte_range offset with header not supported"); + + // Gather row offsets + auto data_row_offsets = + load_data_and_gather_row_offsets(reader_opts, + parse_opts, + header, + h_data, + data_start_offset, + (range_size) ? range_size : h_data.size(), + (skip_rows > 0) ? skip_rows : 0, + num_rows, + load_whole_file, + stream); + auto& row_offsets = data_row_offsets.second; + // Exclude the rows that are to be skipped from the end + if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { + row_offsets.shrink(row_offsets.size() - skip_end_rows); + } + return data_row_offsets; + } + return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; +} + +std::vector select_data_types(std::vector const& column_flags, + std::vector const& dtypes, + int32_t num_actual_columns, + int32_t num_active_columns) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_columns, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_columns, + "Must specify data types for all columns"); + + for (int i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[i]); } + } + } + return selected_dtypes; +} + +std::vector get_data_types_from_column_names( + std::vector const& column_flags, + std::map const& column_type_map, + std::vector const& column_names, + int32_t num_actual_columns) +{ + std::vector selected_dtypes; + + for (int32_t i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { + auto const col_type_it = column_type_map.find(column_names[i]); + CUDF_EXPECTS(col_type_it != column_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + + return selected_dtypes; +} + +std::vector infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + int32_t num_records, + int32_t num_active_columns, + data_type timestamp_type, + rmm::cuda_stream_view stream) { std::vector dtypes; - if (num_records_ == 0) { - dtypes.resize(num_active_cols_, data_type{type_id::EMPTY}); + if (num_records == 0) { + dtypes.resize(num_active_columns, data_type{type_id::EMPTY}); } else { auto column_stats = - cudf::io::csv::gpu::detect_column_types(opts.view(), + cudf::io::csv::gpu::detect_column_types(parse_opts.view(), data, - 
make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, - num_active_cols_, + num_active_columns, stream); stream.synchronize(); - for (int col = 0; col < num_active_cols_; col++) { + for (int col = 0; col < num_active_columns; col++) { unsigned long long int_count_total = column_stats[col].big_int_count + column_stats[col].negative_small_int_count + column_stats[col].positive_small_int_count; - if (column_stats[col].null_count == num_records_) { + if (column_stats[col].null_count == num_records) { // Entire column is NULL; allocate the smallest amount of memory dtypes.emplace_back(cudf::type_id::INT8); } else if (column_stats[col].string_count > 0L) { @@ -649,9 +555,9 @@ std::vector reader::impl::infer_column_types(device_span } } - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { + if (timestamp_type.id() != cudf::type_id::EMPTY) { for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } + if (cudf::is_timestamp(type)) { type = timestamp_type; } } } @@ -663,43 +569,50 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream) +std::vector decode_data(parse_options const& parse_opts, + std::vector const& column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; out_buffers.reserve(column_types.size()); - for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { + for (int col = 0, active_col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; auto out_buffer = column_buffer(column_types[active_col], - num_records_, + num_records, true, stream, - is_final_allocation ? mr_ : rmm::mr::get_current_device_resource()); + is_final_allocation ? 
mr : rmm::mr::get_current_device_resource()); - out_buffer.name = col_names_[col]; + out_buffer.name = column_names[col]; out_buffer.null_count() = UNKNOWN_NULL_COUNT; out_buffers.emplace_back(std::move(out_buffer)); active_col++; } } - thrust::host_vector h_data(num_active_cols_); - thrust::host_vector h_valid(num_active_cols_); + thrust::host_vector h_data(num_active_columns); + thrust::host_vector h_valid(num_active_columns); - for (int i = 0; i < num_active_cols_; ++i) { + for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } - cudf::io::csv::gpu::decode_row_column_data(opts.view(), + cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), data, - make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, make_device_uvector_async(column_types, stream), make_device_uvector_async(h_data, stream), @@ -709,6 +622,209 @@ std::vector reader::impl::decode_data(device_span dat return out_buffers; } +table_with_metadata read_csv(cudf::io::datasource* source, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector header; + + auto const data_row_offsets = + select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream); + + auto const& data = data_row_offsets.first; + auto const& row_offsets = data_row_offsets.second; + + // Exclude the end-of-data row from number of rows with actual data + auto num_records = std::max(row_offsets.size(), 1ul) - 1; + auto column_flags = std::vector(); + auto column_names = std::vector(); + auto num_actual_columns = static_cast(reader_opts.get_names().size()); + auto num_active_columns = num_actual_columns; + + // Check if the user gave us a list of column names + if (not reader_opts.get_names().empty()) { + column_flags.resize(reader_opts.get_names().size(), column_parse::enabled); + column_names = reader_opts.get_names(); + } else { + column_names = get_column_names( + header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + + num_actual_columns = num_active_columns = column_names.size(); + + column_flags.resize(num_actual_columns, column_parse::enabled); + + // Rename empty column names to "Unnamed: col_index" + for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { + if (column_names[col_idx].empty()) { + column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx); + } + } + + // Looking for duplicates + std::unordered_map col_names_histogram; + for (auto& col_name : column_names) { + // Operator [] inserts a default-initialized value if the given key is not + // present + if (++col_names_histogram[col_name] > 1) { + if (reader_opts.is_enabled_mangle_dupe_cols()) { + // Rename duplicates of column X as X.1, X.2, ...; First appearance + // stays as X + do { + col_name += "." 
+ std::to_string(col_names_histogram[col_name] - 1); + } while (col_names_histogram[col_name]++); + } else { + // All duplicate columns will be ignored; First appearance is parsed + const auto idx = &col_name - column_names.data(); + column_flags[idx] = column_parse::disabled; + } + } + } + + // Update the number of columns to be processed, if some might have been + // removed + if (!reader_opts.is_enabled_mangle_dupe_cols()) { + num_active_columns = col_names_histogram.size(); + } + } + + // User can specify which columns should be parsed + if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { + std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); + + for (const auto index : reader_opts.get_use_cols_indexes()) { + column_flags[index] = column_parse::enabled; + } + num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), + reader_opts.get_use_cols_indexes().end()) + .size(); + + for (const auto& name : reader_opts.get_use_cols_names()) { + const auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + auto curr_it = it - column_names.begin(); + if (column_flags[curr_it] == column_parse::disabled) { + column_flags[curr_it] = column_parse::enabled; + num_active_columns++; + } + } + } + } + + // User can specify which columns should be read as datetime + if (!reader_opts.get_parse_dates_indexes().empty() || + !reader_opts.get_parse_dates_names().empty()) { + for (const auto index : reader_opts.get_parse_dates_indexes()) { + column_flags[index] |= column_parse::as_datetime; + } + + for (const auto& name : reader_opts.get_parse_dates_names()) { + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_datetime; + } + } + } + + // User can specify which columns should be parsed as hexadecimal + if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) { + for (const auto index : reader_opts.get_parse_hex_indexes()) { + column_flags[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : reader_opts.get_parse_hex_names()) { + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_hexadecimal; + } + } + } + + // Return empty table rather than exception if nothing to load + if (num_active_columns == 0) { return {std::make_unique
(), {}}; } + + auto metadata = table_metadata{}; + auto out_columns = std::vector>(); + + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + + std::vector column_types; + if (has_to_infer_column_types) { + column_types = infer_column_types( // + parse_opts, + column_flags, + data, + row_offsets, + num_records, + num_active_columns, + reader_opts.get_timestamp_type(), + stream); + } else { + column_types = + std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& data_types) { + return select_data_types( + column_flags, data_types, num_actual_columns, num_active_columns); + }, + [&](const std::map& data_types) { + return get_data_types_from_column_names( // + column_flags, + data_types, + column_names, + num_actual_columns); + }}, + reader_opts.get_dtypes()); + } + + out_columns.reserve(column_types.size()); + + if (num_records != 0) { + auto out_buffers = decode_data( // + parse_opts, + column_flags, + column_names, + data, + row_offsets, + column_types, + num_records, + num_actual_columns, + num_active_columns, + stream, + mr); + for (size_t i = 0; i < column_types.size(); ++i) { + metadata.column_names.emplace_back(out_buffers[i].name); + if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && + parse_opts.doublequote == true) { + // PANDAS' default behavior of enabling doublequote for two consecutive + // quotechars in quoted fields results in reduction to a single quotechar + // TODO: Would be much more efficient to perform this operation in-place + // during the conversion stage + const std::string quotechar(1, parse_opts.quotechar); + const std::string dblquotechar(2, parse_opts.quotechar); + std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); + out_columns.emplace_back( + cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr)); + } else { + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr)); + } + } + } else { + // Create empty columns + for (size_t i = 0; i < column_types.size(); ++i) { + out_columns.emplace_back(make_empty_column(column_types[i])); + } + // Handle empty metadata + for (int col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { + metadata.column_names.emplace_back(column_names[col]); + } + } + } + return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; +} + /** * @brief Create a serialized trie for N/A value matching, based on the options. */ @@ -807,33 +923,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader::impl::impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), opts_(options) -{ - num_actual_cols_ = opts_.get_names().size(); - num_active_cols_ = num_actual_cols_; - - opts = make_parse_options(options, stream); -} +} // namespace -// Forward to implementation -reader::reader(std::vector>&& sources, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(std::unique_ptr&& source, + csv_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), options, stream, mr); -} + auto parse_options = make_parse_options(options, stream); -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); } + return read_csv(source.get(), options, parse_options, stream, mr); +} } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp deleted file mode 100644 index de363a46ffe..00000000000 --- a/cpp/src/io/csv/reader_impl.hpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf { -namespace io { -namespace detail { -namespace csv { -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV reader - * - * The CSV reader is implemented in 4 stages: - * Stage 1: read and optionally decompress the input data in host memory - * (may be a memory-mapped view of the data on disk) - * - * Stage 2: gather the offset of each data row within the csv data. - * Since the number of rows in a given character block may depend on the - * initial parser state (like whether the block starts in a middle of a - * quote or not), a separate row count and output parser state is computed - * for every possible input parser state per 16KB character block. - * The result is then used to infer the parser state and starting row at - * the beginning of every character block. 
- * A second pass can then output the location of every row (which is needed - * for the subsequent parallel conversion of every row from csv text - * to cudf binary form) - * - * Stage 3: Optional stage to infer the data type of each CSV column. - * - * Stage 4: Convert every row from csv text form to cudf binary form. - */ -class reader::impl { - public: - /** - * @brief Constructor from a dataset source with reader options. - * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream); - - private: - /** - * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. - * - * Row offsets are stored this way to avoid reallocation/copies when discarding front or back - * elements. - */ - class selected_rows_offsets { - rmm::device_uvector all; - device_span selected; - - public: - selected_rows_offsets(rmm::device_uvector&& data, - device_span selected_span) - : all{std::move(data)}, selected{selected_span} - { - } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - - operator device_span() const { return selected; } - void shrink(size_t size) - { - CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); - selected = selected.subspan(0, size); - } - void erase_first_n(size_t n) - { - CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); - selected = selected.subspan(n, selected.size() - n); - } - auto size() const { return selected.size(); } - auto data() const { return selected.data(); } - }; - - /** - * @brief Selectively loads data on the GPU and gathers offsets of rows to read. - * - * Selection is based on read options. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, reader::impl::selected_rows_offsets> - select_data_and_row_offsets(rmm::cuda_stream_view stream); - - /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. 
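The shrinkable-span helper above is easiest to see at a call site. The following is a hypothetical sketch only: the member functions (`size`, `erase_first_n`, `shrink`) are the ones defined in the class above, but the surrounding function and parameter names are invented for illustration.

#include <cstddef>

// Hypothetical call site for the selected_rows_offsets class shown above:
// skipped rows and row limits only move the span's endpoints; the underlying
// rmm::device_uvector is never reallocated or copied.
void apply_row_bounds(selected_rows_offsets& row_offsets,
                      std::size_t rows_to_skip,
                      std::size_t max_rows)
{
  row_offsets.erase_first_n(rows_to_skip);  // drop leading rows (e.g. the header)
  if (row_offsets.size() > max_rows) {
    row_offsets.shrink(max_rows);           // enforce the requested row count
  }
}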
- * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory - */ - std::pair, reader::impl::selected_rows_offsets> - load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - - /** - * @brief Find the start position of the first data row - * - * @param h_data Uncompressed input data in host memory - * - * @return Byte position of the first row - */ - size_t find_first_row_start(host_span data); - - /** - * @brief Automatically infers each column's data type based on the CSV's data within that column. - * - * @param data The CSV data from which to infer the columns' data types - * @param row_offsets The row offsets into the CSV's data - * @param stream The stream to which the type inference-kernel will be dispatched - * @return The columns' inferred data types - */ - std::vector infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream); - - /** - * @brief Selects the columns' data types from the map of dtypes. - * - * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::map const& col_type_map); - - /** - * @brief Selects the columns' data types from the list of dtypes. - * - * @param dtypes Vector of data types specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::vector const& dtypes); - - /** - * @brief Converts the row-column data and outputs to column bufferrs. - * - * @param column_types Column types - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return list of column buffers of decoded data, or ptr/size in the case of strings. 
- */ - std::vector decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream); - - private: - rmm::mr::device_memory_resource* mr_ = nullptr; - std::unique_ptr source_; - const csv_reader_options opts_; - - cudf::size_type num_records_ = 0; // Number of rows with actual data - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset - - // Parsing options - parse_options opts{}; - std::vector column_flags_; - - // Intermediate data - std::vector col_names_; - std::vector header_; -}; - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 511a1a22ee7..5ae5d77be1d 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -201,8 +201,6 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { - namespace csv = cudf::io::detail::csv; - CUDF_FUNC_RANGE(); options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); @@ -211,10 +209,13 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); - return reader->read(); + return cudf::io::detail::csv::read_csv( // + std::move(datasources[0]), + options, + rmm::cuda_stream_default, + mr); } // Freeform API wraps the detail writer class API diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 19533c9fbdd..10fc1015528 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -68,7 +68,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - parse_options_view view() + parse_options_view view() const { return {delimiter, terminator, diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index bb6a85094f5..e7f1e867a41 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -50,14 +50,12 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); - table_view const order_table{{std::get<0>(superimposed)}}; auto const flattened = cudf::structs::detail::flatten_nested_columns( - order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); auto const d_flat_order = table_device_view::create(flattened, stream); row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); auto ranks = make_fixed_width_column(data_type{type_to_id()}, - order_table.num_rows(), + flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 0e00221dabf..96ce14644c9 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -567,11 +567,11 @@ class regex_compiler { case LBRA: /* must have been RBRA */ op1 = popand('('); id_inst2 = m_prog.add_inst(RBRA); - m_prog.inst_at(id_inst2).u1.subid = ator.subid; // subidstack[subidstack.size()-1]; + m_prog.inst_at(id_inst2).u1.subid = ator.subid; m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; id_inst1 = m_prog.add_inst(LBRA); - m_prog.inst_at(id_inst1).u1.subid = ator.subid; // subidstack[subidstack.size() - 1]; - m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; + m_prog.inst_at(id_inst1).u1.subid = ator.subid; + m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; pushand(id_inst1, id_inst2); return; case OR: @@ -826,7 +826,8 @@ reprog reprog::create_from(const char32_t* pattern) { reprog rtn; regex_compiler compiler(pattern, ANY, rtn); // future feature: ANYNL - // rtn->print(); + // for debugging, it can be helpful to call rtn.print() here to dump + // out the instructions that have been created from the given pattern return rtn; } @@ -912,6 +913,7 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } +#ifndef NDEBUG void reprog::print() { printf("Instructions:\n"); @@ -992,6 +994,7 @@ void reprog::print() } if (_num_capturing_groups) printf("Number of capturing groups: %d\n", _num_capturing_groups); } +#endif } // namespace detail } // namespace strings diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 854fce15fd4..66e99756615 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -198,10 +198,10 @@ __device__ inline int32_t reprog_device::regexec( { int32_t match = 0; auto checkstart = jnk.starttype; - auto txtlen = dstr.length(); auto pos = begin; auto eos = end; char32_t c = 0; + auto last_character = false; string_view::const_iterator itr = string_view::const_iterator(dstr, pos); jnk.list1->reset(); @@ -235,7 +235,9 @@ __device__ inline int32_t reprog_device::regexec( jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); } - c = static_cast(pos >= txtlen ? 0 : *itr); + last_character = (pos >= dstr.length()); + + c = static_cast(last_character ? 0 : *itr); // expand LBRA, RBRA, BOL, EOL, BOW, NBOW, and OR bool expanded = false; @@ -274,7 +276,7 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if ((c == 0) || (inst->u1.c == '$' && c == '\n')) { + if (last_character || (inst->u1.c == '$' && c == '\n')) { id_activate = inst->u2.next_id; expanded = true; } @@ -360,7 +362,7 @@ __device__ inline int32_t reprog_device::regexec( ++itr; swaplist(jnk.list1, jnk.list2); checkstart = jnk.list1->size > 0 ? 
0 : 1; - } while (c && (jnk.list1->size > 0 || match == 0)); + } while (!last_character && (jnk.list1->size > 0 || match == 0)); return match; } diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 175918a0846..8cfd6d24fae 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -109,6 +109,23 @@ TEST_F(TransformTest, BasicAddition) cudf::test::expect_columns_equal(expected, result->view(), verbosity); } +TEST_F(TransformTest, BasicAdditionCast) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast); + + auto expected = column_wrapper{13, 27, 21, 50}; + auto result = cudf::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + TEST_F(TransformTest, BasicEquality) { auto c_0 = column_wrapper{3, 20, 1, 50}; diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index 90d230ef1eb..4aa4ef236f0 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -32,7 +32,8 @@ using namespace cudf::test::iterators; namespace cudf { namespace test { -using structs = structs_column_wrapper; +constexpr auto nan = std::numeric_limits::quiet_NaN(); +using structs = structs_column_wrapper; template struct groupby_correlation_test : public cudf::test::BaseFixture { @@ -54,8 +55,7 @@ TYPED_TEST(groupby_correlation_test, basic) auto vals = structs{{member_0, member_1}}; fixed_width_column_wrapper expect_keys{1, 2, 3}; - fixed_width_column_wrapper expect_vals{ - {1.0, 0.6, std::numeric_limits::quiet_NaN()}}; + fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}}; auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); @@ -129,8 +129,7 @@ TYPED_TEST(groupby_correlation_test, null_keys_and_values) auto vals = structs{{val0, val1}}; fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - fixed_width_column_wrapper expect_vals( - {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); + fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0}); auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); @@ -153,8 +152,7 @@ TYPED_TEST(groupby_correlation_test, null_values_same) auto vals = structs{{val0, val1}}; fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - fixed_width_column_wrapper expect_vals( - {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0}); + fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0}); auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); @@ -181,14 +179,41 @@ TYPED_TEST(groupby_correlation_test, null_values_different) auto vals = structs{{val0, val1}}; fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); - fixed_width_column_wrapper expect_vals({1.0, 0., std::numeric_limits::quiet_NaN(), 0.}, - {1, 1, 1, 0}); + fixed_width_column_wrapper expect_vals({1.0, 0., nan, 0.}, {1, 1, 1, 0}); auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } 
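A note on the fixtures used by the new min_periods and ddof tests below: the keys column {1, 2, 3, 1, 2, 2, 1, 3, 3, 2} yields group sizes of 3, 4, and 3, so min_periods = 4 leaves only group 2 valid and min_periods = 5 invalidates every group. For covariance the divisor is N - ddof, which explains the ddof expectations: a size-3 group with ddof = 3 divides by zero, hence the masked/infinite entries. A small host-side reference for the recurring 0.6 Pearson value of group 2 follows (not cuDF API, just a check of the arithmetic):

#include <cmath>
#include <vector>

// Host-side reference for Pearson correlation; the (N - ddof) divisor terms
// cancel between the covariance and the two standard deviations.
double pearson(std::vector<double> const& x, std::vector<double> const& y)
{
  double const n = static_cast<double>(x.size());
  double mx = 0.0, my = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    mx += x[i];
    my += y[i];
  }
  mx /= n;
  my /= n;
  double cov = 0.0, vx = 0.0, vy = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    cov += (x[i] - mx) * (y[i] - my);
    vx += (x[i] - mx) * (x[i] - mx);
    vy += (y[i] - my) * (y[i] - my);
  }
  return cov / std::sqrt(vx * vy);
}

// Group 2 of the fixtures below is x = {1, 2, 3, 4}, y = {1, 0, 3, 2}:
// cov = 3.0/3 = 1.0 (ddof = 1), var_x = var_y = 5.0/3, so r = 1.0/(5.0/3) = 0.6.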
+TYPED_TEST(groupby_correlation_test, min_periods) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{1.0, 0.6, nan}}; + auto agg1 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 3); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals2{{1.0, 0.6, nan}, {0, 1, 0}}; + auto agg2 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 4); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals3{{1.0, 0.6, nan}, {0, 0, 0}}; + auto agg3 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 5); + test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES); +} + struct groupby_dictionary_correlation_test : public cudf::test::BaseFixture { }; @@ -203,8 +228,7 @@ TEST_F(groupby_dictionary_correlation_test, basic) auto vals = structs{{member_0, member_1}}; fixed_width_column_wrapper expect_keys{1, 2, 3}; - fixed_width_column_wrapper expect_vals{ - {1.0, 0.6, std::numeric_limits::quiet_NaN()}}; + fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}}; auto agg = cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp index 039fce16222..3a4fbf92387 100644 --- a/cpp/tests/groupby/covariance_tests.cpp +++ b/cpp/tests/groupby/covariance_tests.cpp @@ -175,6 +175,53 @@ TYPED_TEST(groupby_covariance_test, null_values_different) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } +TYPED_TEST(groupby_covariance_test, min_periods) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{1.0, 1.0, 0.0}}; + auto agg1 = cudf::make_covariance_aggregation(3); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals2{{1.0, 1.0, 0.0}, {0, 1, 0}}; + auto agg2 = cudf::make_covariance_aggregation(4); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals3{{1.0, 1.0, 0.0}, {0, 0, 0}}; + auto agg3 = cudf::make_covariance_aggregation(5); + test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, ddof) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 
1, 2}};
+  auto vals = structs{{member_0, member_1}};
+
+  fixed_width_column_wrapper expect_keys{1, 2, 3};
+
+  fixed_width_column_wrapper expect_vals1{{2.0, 1.5, 0.0}};
+  auto agg1 = cudf::make_covariance_aggregation(1, 2);
+  test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES);
+
+  auto const inf = std::numeric_limits::infinity();
+  fixed_width_column_wrapper expect_vals2{{inf, 3.0, 0.0}, {0, 1, 0}};
+  auto agg2 = cudf::make_covariance_aggregation(1, 3);
+  test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES);
+}
+
 struct groupby_dictionary_covariance_test : public cudf::test::BaseFixture {
 };
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 683eeb7eb01..6bbf25646d0 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -289,5 +289,66 @@ TEST_F(groupby_dictionary_keys_test, basic)
                   force_use_sort_impl::YES);
 }
 
+struct groupby_cache_test : public cudf::test::BaseFixture {
+};
+
+// Verify that the cache does not insert the same aggregation on the same column more than once
+// within a single request.
+// If it did, the key stored in the cache map would become a dangling reference, and any later
+// lookup with an equal aggregation would fail.
+TEST_F(groupby_cache_test, duplicate_aggregations)
+{
+  using K = int32_t;
+  using V = int32_t;
+
+  fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  groupby::groupby gb_obj(table_view({keys}));
+
+  std::vector requests;
+  requests.emplace_back(groupby::aggregation_request());
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation());
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation());
+
+  // hash groupby
+  EXPECT_NO_THROW(gb_obj.aggregate(requests));
+
+  // sort groupby
+  // WAR to force groupby to use sort implementation
+  requests[0].aggregations.push_back(make_nth_element_aggregation(0));
+  EXPECT_NO_THROW(gb_obj.aggregate(requests));
+}
+
+// Verify that the cache does not insert the same aggregation on the same column more than once
+// across different requests.
+// If it did, the key stored in the cache map would become a dangling reference, and any later
+// lookup with an equal aggregation would fail.
+TEST_F(groupby_cache_test, duplicate_columns) +{ + using K = int32_t; + using V = int32_t; + + fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + groupby::groupby gb_obj(table_view({keys})); + + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = vals; + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests.emplace_back(groupby::aggregation_request()); + requests[1].values = vals; + requests[1].aggregations.push_back(cudf::make_sum_aggregation()); + + // hash groupby + EXPECT_NO_THROW(gb_obj.aggregate(requests)); + + // sort groupby + // WAR to force groupby to use sort implementation + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + EXPECT_NO_THROW(gb_obj.aggregate(requests)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 17e08bd21c5..ff9f79ea87f 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -28,12 +28,11 @@ struct StringsCharsTest : public cudf::test::BaseFixture { }; -class StringsCharsTestTypes - : public StringsCharsTest, - public testing::WithParamInterface { +class CharsTypes : public StringsCharsTest, + public testing::WithParamInterface { }; -TEST_P(StringsCharsTestTypes, AllTypes) +TEST_P(CharsTypes, AllTypes) { std::vector h_strings{"Héllo", "thesé", @@ -84,8 +83,8 @@ TEST_P(StringsCharsTestTypes, AllTypes) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(StringsCharsTestAllTypes, - StringsCharsTestTypes, +INSTANTIATE_TEST_CASE_P(StringsCharsTest, + CharsTypes, testing::ValuesIn(std::array{ cudf::strings::string_character_types::DECIMAL, cudf::strings::string_character_types::NUMERIC, diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index ddd6fc9e1dc..bb9a4c9cf5e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture { @@ -236,6 +237,30 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) } } +TEST_F(StringsContainsTests, EmbeddedNullCharacter) +{ + std::vector data(10); + std::generate(data.begin(), data.end(), [n = 0]() mutable { + char first = static_cast('A' + n++); + char raw_data[] = {first, '\0', 'B'}; + return std::string{raw_data, 3}; + }); + cudf::test::strings_column_wrapper input(data.begin(), data.end()); + auto strings_view = cudf::strings_column_view(input); + + auto results = cudf::strings::contains_re(strings_view, "A"); + auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = cudf::strings::contains_re(strings_view, "B"); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = cudf::strings::contains_re(strings_view, "J\\0B"); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsContainsTests, CountTest) { std::vector h_strings{ diff --git a/cpp/tests/strings/pad_tests.cpp b/cpp/tests/strings/pad_tests.cpp index a64304d1027..f344b5432a2 100644 --- a/cpp/tests/strings/pad_tests.cpp +++ 
b/cpp/tests/strings/pad_tests.cpp @@ -104,11 +104,10 @@ TEST_F(StringsPadTest, ZeroSizeStringsColumn) cudf::test::expect_strings_empty(results->view()); } -class StringsPadParmsTest : public StringsPadTest, - public testing::WithParamInterface { +class PadParameters : public StringsPadTest, public testing::WithParamInterface { }; -TEST_P(StringsPadParmsTest, Padding) +TEST_P(PadParameters, Padding) { std::vector h_strings{"eee ddd", "bb cc", "aa", "bbb", "fff", "", "o"}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); @@ -128,8 +127,8 @@ TEST_P(StringsPadParmsTest, Padding) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(StringsPadParmWidthTest, - StringsPadParmsTest, +INSTANTIATE_TEST_CASE_P(StringsPadTest, + PadParameters, testing::ValuesIn(std::array{5, 6, 7})); TEST_F(StringsPadTest, ZFill) diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 1f01f0f1429..fc1c20d8719 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ #include -struct StringsReplaceTests : public cudf::test::BaseFixture { +struct StringsReplaceRegexTest : public cudf::test::BaseFixture { }; -TEST_F(StringsReplaceTests, ReplaceRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -59,7 +59,7 @@ TEST_F(StringsReplaceTests, ReplaceRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceMultiRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -95,7 +95,7 @@ TEST_F(StringsReplaceTests, ReplaceMultiRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, InvalidRegex) +TEST_F(StringsReplaceRegexTest, InvalidRegex) { cudf::test::strings_column_wrapper strings( {"abc*def|ghi+jkl", ""}); // these do not really matter @@ -116,7 +116,7 @@ TEST_F(StringsReplaceTests, InvalidRegex) cudf::logic_error); } -TEST_F(StringsReplaceTests, WithEmptyPattern) +TEST_F(StringsReplaceRegexTest, WithEmptyPattern) { std::vector h_strings{"asd", "xcv"}; cudf::test::strings_column_wrapper strings( @@ -133,7 +133,7 @@ TEST_F(StringsReplaceTests, WithEmptyPattern) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -167,7 +167,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexAltIndexPatternTest) { cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); auto strings_view = cudf::strings_column_view(strings); @@ -181,7 +181,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( {"A543", "Z756", "", "tést-string", "two-thréé four-fivé", "abcd-éfgh", "tést-string-again"}); @@ -200,7 +200,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) +TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) { cudf::test::strings_column_wrapper input( {"
<h1>title</h1><h2>ABC</h2>", "<h1>1234567</h1><h2>XYZ</h2>
"}); @@ -217,7 +217,7 @@ TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) { cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); auto view = cudf::strings_column_view(strings); @@ -228,7 +228,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); } -TEST_F(StringsReplaceTests, MediumReplaceRegex) +TEST_F(StringsReplaceRegexTest, MediumReplaceRegex) { // This results in 95 regex instructions and falls in the 'medium' range. std::string medium_regex = @@ -256,7 +256,7 @@ TEST_F(StringsReplaceTests, MediumReplaceRegex) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, LargeReplaceRegex) +TEST_F(StringsReplaceRegexTest, LargeReplaceRegex) { // This results in 117 regex instructions and falls in the 'large' range. std::string large_regex = diff --git a/cpp/tests/strings/substring_tests.cpp b/cpp/tests/strings/substring_tests.cpp index f9a71407a0d..448b61300fd 100644 --- a/cpp/tests/strings/substring_tests.cpp +++ b/cpp/tests/strings/substring_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include #include @@ -28,8 +28,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - struct StringsSubstringsTest : public cudf::test::BaseFixture { }; @@ -51,11 +49,11 @@ TEST_F(StringsSubstringsTest, Substring) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -class SubstringParmsTest : public StringsSubstringsTest, - public testing::WithParamInterface { +class Parameters : public StringsSubstringsTest, + public testing::WithParamInterface { }; -TEST_P(SubstringParmsTest, Substring) +TEST_P(Parameters, Substring) { std::vector h_strings{"basic strings", "that can", "be used", "with STL"}; cudf::size_type start = GetParam(); @@ -72,7 +70,7 @@ TEST_P(SubstringParmsTest, Substring) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, Substring_From) +TEST_P(Parameters, Substring_From) { std::vector h_strings{"basic strings", "that can", "be used", "with STL"}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); @@ -96,7 +94,7 @@ TEST_P(SubstringParmsTest, Substring_From) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, AllEmpty) +TEST_P(Parameters, AllEmpty) { std::vector h_strings{"", "", "", ""}; cudf::size_type start = GetParam(); @@ -118,7 +116,7 @@ TEST_P(SubstringParmsTest, AllEmpty) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, AllNulls) +TEST_P(Parameters, AllNulls) { std::vector h_strings{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; cudf::test::strings_column_wrapper strings( @@ -147,23 +145,9 @@ TEST_P(SubstringParmsTest, AllNulls) } INSTANTIATE_TEST_CASE_P(StringsSubstringsTest, - SubstringParmsTest, + Parameters, testing::ValuesIn(std::array{1, 2, 3})); -TEST_F(StringsSubstringsTest, ZeroSizeStringsColumn) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_column = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::slice_strings(strings_column, 1, 2); - 
cudf::test::expect_strings_empty(results->view()); - - cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - results = cudf::strings::slice_strings(strings_column, starts_column, stops_column); - cudf::test::expect_strings_empty(results->view()); -} - TEST_F(StringsSubstringsTest, NegativePositions) { cudf::test::strings_column_wrapper strings{ @@ -270,34 +254,59 @@ TEST_F(StringsSubstringsTest, MaxPositions) TEST_F(StringsSubstringsTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; - auto strings_column = cudf::strings_column_view(strings); - EXPECT_THROW(cudf::strings::slice_strings(strings_column, 0, 0, 0), cudf::logic_error); -} + auto strings_view = cudf::strings_column_view(strings); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, 0, 0, 0), cudf::logic_error); -struct StringsSubstringsScalarDelimiterTest : public cudf::test::BaseFixture { -}; + auto delim_col = cudf::test::strings_column_wrapper({"", ""}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); + + auto indexes = cudf::test::fixed_width_column_wrapper({1, 2}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes, indexes), cudf::logic_error); -TEST_F(StringsSubstringsScalarDelimiterTest, ZeroSizeStringsColumn) + auto indexes_null = cudf::test::fixed_width_column_wrapper({1}, {0}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_null, indexes_null), + cudf::logic_error); + + auto indexes_bad = cudf::test::fixed_width_column_wrapper({1}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_bad, indexes_bad), + cudf::logic_error); +} + +TEST_F(StringsSubstringsTest, ZeroSizeStringsColumn) { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); + cudf::column_view zero_size_strings_column( + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); + + auto results = cudf::strings::slice_strings(strings_view, 1, 2); + cudf::test::expect_strings_empty(results->view()); + + results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + cudf::test::expect_strings_empty(results->view()); + + cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + results = cudf::strings::slice_strings(strings_view, starts_column, stops_column); + cudf::test::expect_strings_empty(results->view()); - auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + results = cudf::strings::slice_strings(strings_view, strings_view, 1); cudf::test::expect_strings_empty(results->view()); } -TEST_F(StringsSubstringsScalarDelimiterTest, AllEmpty) +TEST_F(StringsSubstringsTest, AllEmpty) { auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); auto strings_view = cudf::strings_column_view(strings_col); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto exp_results = cudf::column_view(strings_col); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("e"), -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, 
exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + results = cudf::strings::slice_strings(strings_view, strings_view, -1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) +TEST_F(StringsSubstringsTest, EmptyDelimiter) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -306,11 +315,18 @@ TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {true, true, false, true, true, true}); - auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) +TEST_F(StringsSubstringsTest, ZeroCount) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -321,10 +337,16 @@ TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) +TEST_F(StringsSubstringsTest, SearchScalarDelimiter) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -336,7 +358,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -344,17 +366,17 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_view.parent(), verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col); } { auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_view.parent(), verbosity); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col); } { @@ -367,7 +389,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -380,7 +402,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -394,7 +416,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -408,7 +430,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -424,7 +446,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -441,76 +463,11 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } } -struct StringsSubstringsColumnDelimiterTest : public cudf::test::BaseFixture { -}; - -TEST_F(StringsSubstringsColumnDelimiterTest, ZeroSizeStringsColumn) -{ - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); - - auto results = cudf::strings::slice_strings(strings_view, strings_view, 1); - // Check empty column - cudf::test::expect_strings_empty(results->view()); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, GenerateExceptions) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); - - EXPECT_THROW(cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), - cudf::logic_error); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, ColumnAllEmpty) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, DelimiterAllEmptyAndInvalid) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto 
delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, ZeroDelimiterCount) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) +TEST_F(StringsSubstringsTest, SearchColumnDelimiter) { { auto col0 = cudf::test::strings_column_wrapper( @@ -523,7 +480,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -541,7 +498,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -565,7 +522,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -587,6 +544,6 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } } diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cpp similarity index 100% rename from cpp/tests/strings/utilities.cu rename to cpp/tests/strings/utilities.cpp diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 0f10d6efe4a..6cf36208f0b 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -809,7 +809,7 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt std::vector bitmask_to_host(cudf::column_view const& c) { if (c.nullable()) { - auto num_bitmasks = bitmask_allocation_size_bytes(c.size()) / sizeof(bitmask_type); + auto num_bitmasks = num_bitmask_words(c.size()); std::vector host_bitmask(num_bitmasks); if (c.offset() == 0) { CUDA_TRY(cudaMemcpy(host_bitmask.data(), diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp 
index 0dc10f9d717..82a22eb1dbc 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -133,7 +133,7 @@ TYPED_TEST(ColumnUtilitiesTest, NullableToHostAllValid) auto masks = cudf::test::detail::make_null_mask_vector(all_valid, all_valid + size); - EXPECT_TRUE(std::equal(masks.begin(), masks.end(), host_data.second.begin())); + EXPECT_TRUE(cudf::test::validate_host_masks(masks, host_data.second, size)); } struct ColumnUtilitiesEquivalenceTest : public cudf::test::BaseFixture { diff --git a/docs/cudf/source/basics/io-supported-types.rst b/docs/cudf/source/basics/io-supported-types.rst index 544acb9c683..0962113eb25 100644 --- a/docs/cudf/source/basics/io-supported-types.rst +++ b/docs/cudf/source/basics/io-supported-types.rst @@ -56,7 +56,7 @@ The following table lists are compatible cudf types for each supported IO format +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | datetime64[ns] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | + | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | decimal32 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ diff --git a/java/src/main/java/ai/rapids/cudf/CloseableArray.java b/java/src/main/java/ai/rapids/cudf/CloseableArray.java new file mode 100644 index 00000000000..5c75f2378e8 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CloseableArray.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +/** Utility class that wraps an array of closeable instances and can be closed */ +public class CloseableArray implements AutoCloseable { + private T[] array; + + public static CloseableArray wrap(T[] array) { + return new CloseableArray(array); + } + + CloseableArray(T[] array) { + this.array = array; + } + + public int size() { + return array.length; + } + + public T get(int i) { + return array[i]; + } + + public T set(int i, T obj) { + array[i] = obj; + return obj; + } + + public T[] getArray() { + return array; + } + + public T[] release() { + T[] result = array; + array = null; + return result; + } + + public void closeAt(int i) { + try { + T toClose = array[i]; + array[i] = null; + toClose.close(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() { + close(null); + } + + public void close(Exception pendingError) { + if (array == null) { + return; + } + T[] toClose = array; + array = null; + RuntimeException error = null; + if (pendingError instanceof RuntimeException) { + error = (RuntimeException) pendingError; + } else if (pendingError != null) { + error = new RuntimeException(pendingError); + } + for (T obj: toClose) { + if (obj != null) { + try { + obj.close(); + } catch (RuntimeException e) { + if (error != null) { + error.addSuppressed(e); + } else { + error = e; + } + } catch (Exception e) { + if (error != null) { + error.addSuppressed(e); + } else { + error = new RuntimeException(e); + } + } + } + } + if (error != null) { + throw error; + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index 02e4d32617d..bb2d6dbde7d 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -15,6 +15,9 @@ */ package ai.rapids.cudf; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -521,4 +524,27 @@ public static synchronized boolean isEnvCompatibleForTesting() { * Whether per-thread default stream is enabled. */ public static native boolean isPtdsEnabled(); + + /** + * Copy data from multiple device buffer sources to multiple device buffer destinations. + * For each buffer to copy there is a corresponding entry in the destination address, source + * address, and copy size vectors. 
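+   * <p>
+   * Example (illustrative only; {@code dstAddr0}, {@code srcAddr0}, etc. stand in for
+   * real device addresses obtained elsewhere):
+   * <pre>
+   *   long[] dsts  = new long[]{dstAddr0, dstAddr1};
+   *   long[] srcs  = new long[]{srcAddr0, srcAddr1};
+   *   long[] sizes = new long[]{1024L, 2048L};
+   *   Cuda.multiBufferCopyAsync(dsts, srcs, sizes, Cuda.DEFAULT_STREAM);
+   * </pre>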
+ * @param destAddrs vector of device destination addresses + * @param srcAddrs vector of device source addresses + * @param copySizes vector of copy sizes + * @param stream CUDA stream to use for the copy + */ + public static void multiBufferCopyAsync(long [] destAddrs, + long [] srcAddrs, + long [] copySizes, + Stream stream) { + // Temporary sub-par stand-in for a multi-buffer copy CUDA kernel + assert(destAddrs.length == srcAddrs.length); + assert(copySizes.length == destAddrs.length); + try (NvtxRange copyRange = new NvtxRange("multiBufferCopyAsync", NvtxColor.CYAN)){ + for (int i = 0; i < destAddrs.length; i++) { + asyncMemcpy(destAddrs[i], srcAddrs[i], copySizes[i], CudaMemcpyKind.DEVICE_TO_DEVICE, stream); + } + } + } } diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java index a936d4830ee..05545807bb6 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java @@ -19,8 +19,6 @@ package ai.rapids.cudf; import ai.rapids.cudf.ast.CompiledExpression; -import ai.rapids.cudf.nvcomp.BatchedLZ4Decompressor; -import ai.rapids.cudf.nvcomp.Decompressor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -248,16 +246,6 @@ static void register(Cuda.Event event, Cleaner cleaner) { all.add(new CleanerWeakReference(event, cleaner, collected, false)); } - public static void register(Decompressor.Metadata metadata, Cleaner cleaner) { - // It is now registered... - all.add(new CleanerWeakReference(metadata, cleaner, collected, false)); - } - - public static void register(BatchedLZ4Decompressor.BatchedMetadata metadata, Cleaner cleaner) { - // It is now registered... - all.add(new CleanerWeakReference(metadata, cleaner, collected, false)); - } - static void register(CuFileDriver driver, Cleaner cleaner) { // It is now registered... all.add(new CleanerWeakReference(driver, cleaner, collected, false)); @@ -324,4 +312,4 @@ public String toString() { + "\n"; } } -} \ No newline at end of file +} diff --git a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java index 07983f77aad..408c93ff0a1 100644 --- a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java @@ -82,6 +82,19 @@ public static RollingAggregation max() { return new RollingAggregation(Aggregation.max()); } + /** + * Rolling Window Standard Deviation with 1 as delta degrees of freedom(DDOF). + */ + public static RollingAggregation standardDeviation() { + return new RollingAggregation(Aggregation.standardDeviation()); + } + + /** + * Rolling Window Standard Deviation with configurable delta degrees of freedom(DDOF). + */ + public static RollingAggregation standardDeviation(int ddof) { + return new RollingAggregation(Aggregation.standardDeviation(ddof)); + } /** * Count number of valid, a.k.a. non-null, elements. diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java index 9ef18dbd75d..6fb5a16d888 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java @@ -23,29 +23,32 @@ * NOTE: This must be kept in sync with `jni_to_unary_operator` in CompiledExpression.cpp! 
*/ public enum UnaryOperator { - IDENTITY(0), // Identity function - SIN(1), // Trigonometric sine - COS(2), // Trigonometric cosine - TAN(3), // Trigonometric tangent - ARCSIN(4), // Trigonometric sine inverse - ARCCOS(5), // Trigonometric cosine inverse - ARCTAN(6), // Trigonometric tangent inverse - SINH(7), // Hyperbolic sine - COSH(8), // Hyperbolic cosine - TANH(9), // Hyperbolic tangent - ARCSINH(10), // Hyperbolic sine inverse - ARCCOSH(11), // Hyperbolic cosine inverse - ARCTANH(12), // Hyperbolic tangent inverse - EXP(13), // Exponential (base e, Euler number) - LOG(14), // Natural Logarithm (base e) - SQRT(15), // Square-root (x^0.5) - CBRT(16), // Cube-root (x^(1.0/3)) - CEIL(17), // Smallest integer value not less than arg - FLOOR(18), // largest integer value not greater than arg - ABS(19), // Absolute value - RINT(20), // Rounds the floating-point argument arg to an integer value - BIT_INVERT(21), // Bitwise Not (~) - NOT(22); // Logical Not (!) + IDENTITY(0), // Identity function + SIN(1), // Trigonometric sine + COS(2), // Trigonometric cosine + TAN(3), // Trigonometric tangent + ARCSIN(4), // Trigonometric sine inverse + ARCCOS(5), // Trigonometric cosine inverse + ARCTAN(6), // Trigonometric tangent inverse + SINH(7), // Hyperbolic sine + COSH(8), // Hyperbolic cosine + TANH(9), // Hyperbolic tangent + ARCSINH(10), // Hyperbolic sine inverse + ARCCOSH(11), // Hyperbolic cosine inverse + ARCTANH(12), // Hyperbolic tangent inverse + EXP(13), // Exponential (base e, Euler number) + LOG(14), // Natural Logarithm (base e) + SQRT(15), // Square-root (x^0.5) + CBRT(16), // Cube-root (x^(1.0/3)) + CEIL(17), // Smallest integer value not less than arg + FLOOR(18), // largest integer value not greater than arg + ABS(19), // Absolute value + RINT(20), // Rounds the floating-point argument arg to an integer value + BIT_INVERT(21), // Bitwise Not (~) + NOT(22), // Logical Not (!) + CAST_TO_INT64(23), // Cast value to int64_t + CAST_TO_UINT64(24), // Cast value to uint64_t + CAST_TO_FLOAT64(25); // Cast value to double private final byte nativeId; diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java index 88b20414b0c..1ab3b97945d 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,199 +17,302 @@ package ai.rapids.cudf.nvcomp; import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.CloseableArray; import ai.rapids.cudf.Cuda; import ai.rapids.cudf.DeviceMemoryBuffer; import ai.rapids.cudf.HostMemoryBuffer; +import ai.rapids.cudf.MemoryBuffer; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + +import java.util.Arrays; /** Multi-buffer LZ4 compressor */ public class BatchedLZ4Compressor { - /** Describes a batched compression result */ - public static class BatchedCompressionResult { - private final DeviceMemoryBuffer[] compressedBuffers; - private final long[] compressedSizes; - - BatchedCompressionResult(DeviceMemoryBuffer[] buffers, long[] sizes) { - this.compressedBuffers = buffers; - this.compressedSizes = sizes; - } + static final long MAX_CHUNK_SIZE = 16777216; // in bytes + // each chunk has a 64-bit integer value as metadata containing the compressed size + static final long METADATA_BYTES_PER_CHUNK = 8; - /** - * Get the output compressed buffers corresponding to the input buffers. - * Note that the buffers are likely larger than required to store the compressed data. - */ - public DeviceMemoryBuffer[] getCompressedBuffers() { - return compressedBuffers; - } - - /** Get the corresponding amount of compressed data in each output buffer. */ - public long[] getCompressedSizes() { - return compressedSizes; - } - } + private final long chunkSize; + private final long targetIntermediateBufferSize; + private final long maxOutputChunkSize; /** - * Get the amount of temporary storage space required to compress a batch of buffers. - * @param inputs batch of data buffers to be individually compressed - * @param chunkSize compression chunk size to use - * @return amount in bytes of temporary storage space required to compress the batch + * Construct a batched LZ4 compressor instance + * @param chunkSize maximum amount of uncompressed data to compress as a single chunk. Inputs + * larger than this will be compressed in multiple chunks. + * @param targetIntermediateBufferSize desired maximum size of intermediate device buffers + * used during compression. */ - public static long getTempSize(BaseDeviceMemoryBuffer[] inputs, long chunkSize) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - return NvcompJni.batchedLZ4CompressGetTempSize(inputAddrs, inputSizes, chunkSize); + public BatchedLZ4Compressor(long chunkSize, long targetIntermediateBufferSize) { + validateChunkSize(chunkSize); + this.chunkSize = chunkSize; + this.maxOutputChunkSize = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(chunkSize); + assert maxOutputChunkSize < Integer.MAX_VALUE; + this.targetIntermediateBufferSize = Math.max(targetIntermediateBufferSize, maxOutputChunkSize); } /** - * Get the amount of output storage space required to compress a batch of buffers. - * @param inputs batch of data buffers to be individually compressed - * @param chunkSize compression chunk size to use - * @param tempBuffer temporary storage space - * @return amount in bytes of output storage space corresponding to each input buffer in the batch + * Compress a batch of buffers with LZ4. The input buffers will be closed. 
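+   * Each output buffer is laid out as per-chunk compressed-size metadata
+   * (8 bytes per chunk) followed by the compressed chunks themselves.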
+ * @param origInputs buffers to compress + * @param stream CUDA stream to use + * @return compressed buffers corresponding to the input buffers */ - public static long[] getOutputSizes(BaseDeviceMemoryBuffer[] inputs, long chunkSize, - BaseDeviceMemoryBuffer tempBuffer) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + public DeviceMemoryBuffer[] compress(BaseDeviceMemoryBuffer[] origInputs, Cuda.Stream stream) { + try (CloseableArray inputs = CloseableArray.wrap(origInputs)) { + if (chunkSize <= 0) { + throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + } + final int numInputs = inputs.size(); + if (numInputs == 0) { + return new DeviceMemoryBuffer[0]; + } + + // Each buffer is broken up into chunkSize chunks for compression. Calculate how many + // chunks are needed for each input buffer. + int[] chunksPerInput = new int[numInputs]; + int numChunks = 0; + for (int i = 0; i < numInputs; i++) { + BaseDeviceMemoryBuffer buffer = inputs.get(i); + int numBufferChunks = getNumChunksInBuffer(buffer); + chunksPerInput[i] = numBufferChunks; + numChunks += numBufferChunks; + } + + // Allocate buffers for each chunk and generate parallel lists of chunk source addresses, + // chunk destination addresses, and sizes. + try (CloseableArray compressedBuffers = + allocCompressedBuffers(numChunks, stream); + DeviceMemoryBuffer compressedChunkSizes = + DeviceMemoryBuffer.allocate(numChunks * 8L, stream)) { + long[] inputChunkAddrs = new long[numChunks]; + long[] inputChunkSizes = new long[numChunks]; + long[] outputChunkAddrs = new long[numChunks]; + buildAddrsAndSizes(inputs, inputChunkAddrs, inputChunkSizes, + compressedBuffers, outputChunkAddrs); + + long[] outputChunkSizes; + final long tempBufferSize = NvcompJni.batchedLZ4CompressGetTempSize(numChunks, chunkSize); + try (DeviceMemoryBuffer addrsAndSizes = + putAddrsAndSizesOnDevice(inputChunkAddrs, inputChunkSizes, outputChunkAddrs, stream); + DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempBufferSize, stream)) { + final long devOutputAddrsPtr = addrsAndSizes.getAddress() + numChunks * 8L; + final long devInputSizesPtr = devOutputAddrsPtr + numChunks * 8L; + NvcompJni.batchedLZ4CompressAsync( + addrsAndSizes.getAddress(), + devInputSizesPtr, + chunkSize, + numChunks, + tempBuffer.getAddress(), + tempBufferSize, + devOutputAddrsPtr, + compressedChunkSizes.getAddress(), + stream.getStream()); + } + + // Synchronously copy the resulting compressed sizes per chunk. 
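+        // The per-chunk sizes are needed on the host to compute the stitched
+        // output buffer lengths, so this copy must complete before stitching.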
+ outputChunkSizes = getOutputChunkSizes(compressedChunkSizes, stream); + + // inputs are no longer needed at this point, so free them early + inputs.close(); + + // Combine compressed chunks into output buffers corresponding to each original input + return stitchOutput(chunksPerInput, compressedChunkSizes, outputChunkAddrs, + outputChunkSizes, stream); + } } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + } + + static void validateChunkSize(long chunkSize) { + if (chunkSize <= 0 || chunkSize > MAX_CHUNK_SIZE) { + throw new IllegalArgumentException("Invalid chunk size: " + chunkSize + " Max chunk size is: " + + MAX_CHUNK_SIZE + " bytes"); } - return NvcompJni.batchedLZ4CompressGetOutputSize(inputAddrs, inputSizes, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength()); } - /** - * Calculates the minimum size in bytes necessary to store the compressed output sizes - * when performing an asynchronous batch compression. - * @param numBuffers number of buffers in the batch - * @return minimum size of the compressed output sizes buffer needed - */ - public static long getCompressedSizesBufferSize(int numBuffers) { - // Each compressed size value is a 64-bit long - return numBuffers * 8; + private static long ceilingDivide(long x, long y) { + return (x + y - 1) / y; } - /** - * Asynchronously compress a batch of input buffers. The compressed size output buffer must be - * pinned memory for this operation to be truly asynchronous. Note that the caller must - * synchronize on the specified CUDA stream in order to safely examine the compressed output - * sizes! 
- * @param compressedSizesOutputBuffer host memory where the compressed output size will be stored - * @param inputs buffers to compress - * @param chunkSize type of data within each buffer - * @param tempBuffer compression chunk size to use - * @param outputs output buffers that will contain the compressed results - * @param stream CUDA stream to use - */ - public static void compressAsync(HostMemoryBuffer compressedSizesOutputBuffer, - BaseDeviceMemoryBuffer[] inputs, long chunkSize, - BaseDeviceMemoryBuffer tempBuffer, - BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - int numBuffers = inputs.length; - if (outputs.length != numBuffers) { - throw new IllegalArgumentException("buffer count mismatch, " + numBuffers + " inputs and " + - outputs.length + " outputs"); - } - if (compressedSizesOutputBuffer.getLength() < getCompressedSizesBufferSize(numBuffers)) { - throw new IllegalArgumentException("compressed output size buffer must be able to hold " + - "at least 8 bytes per buffer, size is only " + compressedSizesOutputBuffer.getLength()); - } + private int getNumChunksInBuffer(MemoryBuffer buffer) { + return (int) ceilingDivide(buffer.getLength(), chunkSize); + } - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + private CloseableArray allocCompressedBuffers(long numChunks, + Cuda.Stream stream) { + final long chunksPerBuffer = targetIntermediateBufferSize / maxOutputChunkSize; + final long numBuffers = ceilingDivide(numChunks, chunksPerBuffer); + if (numBuffers > Integer.MAX_VALUE) { + throw new IllegalStateException("Too many chunks"); } - - long[] outputAddrs = new long[numBuffers]; - long[] outputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = outputs[i]; - outputAddrs[i] = buffer.getAddress(); - outputSizes[i] = buffer.getLength(); + try (NvtxRange range = new NvtxRange("allocCompressedBuffers", NvtxColor.YELLOW)) { + CloseableArray buffers = CloseableArray.wrap( + new DeviceMemoryBuffer[(int) numBuffers]); + try { + // allocate all of the max-chunks intermediate compressed buffers + for (int i = 0; i < buffers.size() - 1; ++i) { + buffers.set(i, DeviceMemoryBuffer.allocate(chunksPerBuffer * maxOutputChunkSize, stream)); + } + // allocate the tail intermediate compressed buffer that may be smaller than the others + buffers.set(buffers.size() - 1, DeviceMemoryBuffer.allocate( + (numChunks - chunksPerBuffer * (buffers.size() - 1)) * maxOutputChunkSize, stream)); + return buffers; + } catch (Exception e) { + buffers.close(e); + throw e; + } } - - NvcompJni.batchedLZ4CompressAsync(compressedSizesOutputBuffer.getAddress(), - inputAddrs, inputSizes, chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), - outputAddrs, outputSizes, stream.getStream()); } - /** - * Compress a batch of buffers with LZ4 - * @param inputs buffers to compress - * @param chunkSize compression chunk size to use - * @param stream CUDA stream to use - * @return compression results containing the corresponding output buffer and compressed data size - * for each input buffer - */ - public static BatchedCompressionResult compress(BaseDeviceMemoryBuffer[] inputs, long chunkSize, - Cuda.Stream stream) { - if (chunkSize <= 0) { - throw new 
IllegalArgumentException("Illegal chunk size: " + chunkSize); + // Fill in the inputChunkAddrs, inputChunkSizes, and outputChunkAddrs arrays to point + // into the chunks in the input and output buffers. + private void buildAddrsAndSizes(CloseableArray inputs, + long[] inputChunkAddrs, + long[] inputChunkSizes, + CloseableArray compressedBuffers, + long[] outputChunkAddrs) { + // setup the input addresses and sizes + int chunkIdx = 0; + for (BaseDeviceMemoryBuffer input : inputs.getArray()) { + final int numChunksInBuffer = getNumChunksInBuffer(input); + for (int i = 0; i < numChunksInBuffer; i++) { + inputChunkAddrs[chunkIdx] = input.getAddress() + i * chunkSize; + inputChunkSizes[chunkIdx] = (i != numChunksInBuffer - 1) ? chunkSize + : (input.getLength() - (long) i * chunkSize); + ++chunkIdx; + } } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + assert chunkIdx == inputChunkAddrs.length; + assert chunkIdx == inputChunkSizes.length; + + // setup output addresses + chunkIdx = 0; + for (DeviceMemoryBuffer buffer : compressedBuffers.getArray()) { + assert buffer.getLength() % maxOutputChunkSize == 0; + long numChunksInBuffer = buffer.getLength() / maxOutputChunkSize; + long baseAddr = buffer.getAddress(); + for (int i = 0; i < numChunksInBuffer; i++) { + outputChunkAddrs[chunkIdx++] = baseAddr + i * maxOutputChunkSize; + } } + assert chunkIdx == outputChunkAddrs.length; + } - DeviceMemoryBuffer[] outputBuffers = new DeviceMemoryBuffer[numBuffers]; - try { - long tempSize = NvcompJni.batchedLZ4CompressGetTempSize(inputAddrs, inputSizes, chunkSize); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - long[] outputSizes = NvcompJni.batchedLZ4CompressGetOutputSize(inputAddrs, inputSizes, - chunkSize, tempBuffer.getAddress(), tempBuffer.getLength()); - long[] outputAddrs = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(outputSizes[i]); - outputBuffers[i] = buffer; - outputAddrs[i] = buffer.getAddress(); + // Write input addresses, output addresses and sizes contiguously into a DeviceMemoryBuffer. 
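+  // Layout: input chunk addresses, then output chunk addresses, then input
+  // chunk sizes, 8 bytes per entry; compress() derives the device-side
+  // pointers for each vector from these fixed offsets.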
+ private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs, + long[] inputSizes, + long[] outputAddrs, + Cuda.Stream stream) { + final long totalSize = inputAddrs.length * 8L * 3; // space for input, output, and size arrays + final long outputAddrsOffset = inputAddrs.length * 8L; + final long sizesOffset = outputAddrsOffset + inputAddrs.length * 8L; + try (NvtxRange range = new NvtxRange("putAddrsAndSizesOnDevice", NvtxColor.YELLOW)) { + try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(totalSize); + DeviceMemoryBuffer result = DeviceMemoryBuffer.allocate(totalSize)) { + hostbuf.setLongs(0, inputAddrs, 0, inputAddrs.length); + hostbuf.setLongs(outputAddrsOffset, outputAddrs, 0, outputAddrs.length); + for (int i = 0; i < inputSizes.length; i++) { + hostbuf.setLong(sizesOffset + i * 8L, inputSizes[i]); } + result.copyFromHostBuffer(hostbuf, stream); + result.incRefCount(); + return result; + } + } + } - long compressedSizesBufferSize = getCompressedSizesBufferSize(numBuffers); - try (HostMemoryBuffer compressedSizesBuffer = - HostMemoryBuffer.allocate(compressedSizesBufferSize)) { - NvcompJni.batchedLZ4CompressAsync(compressedSizesBuffer.getAddress(), - inputAddrs, inputSizes, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength(), - outputAddrs, outputSizes, stream.getStream()); - stream.sync(); - long[] compressedSizes = new long[numBuffers]; - compressedSizesBuffer.getLongs(compressedSizes, 0, 0, numBuffers); - return new BatchedCompressionResult(outputBuffers, compressedSizes); + // Synchronously copy the resulting compressed sizes from device memory to host memory. + private long[] getOutputChunkSizes(BaseDeviceMemoryBuffer devChunkSizes, Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("getOutputChunkSizes", NvtxColor.YELLOW)) { + try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(devChunkSizes.getLength())) { + hostbuf.copyFromDeviceBuffer(devChunkSizes, stream); + int numChunks = (int) (devChunkSizes.getLength() / 8); + long[] result = new long[numChunks]; + for (int i = 0; i < numChunks; i++) { + long size = hostbuf.getLong(i * 8L); + assert size < Integer.MAX_VALUE : "output size is too big"; + result[i] = size; } + return result; } - } catch (Throwable t) { - for (DeviceMemoryBuffer buffer : outputBuffers) { - if (buffer != null) { - buffer.close(); + } + } + + // Stitch together the individual chunks into the result buffers. + // Each result buffer has metadata at the beginning, followed by compressed chunks. + // This is done by building up parallel lists of source addr, dest addr and size and + // then calling multiBufferCopyAsync() + private DeviceMemoryBuffer[] stitchOutput(int[] chunksPerInput, + DeviceMemoryBuffer compressedChunkSizes, + long[] outputChunkAddrs, + long[] outputChunkSizes, + Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("stitchOutput", NvtxColor.YELLOW)) { + final int numOutputs = chunksPerInput.length; + final long chunkSizesAddr = compressedChunkSizes.getAddress(); + long[] outputBufferSizes = calcOutputBufferSizes(chunksPerInput, outputChunkSizes); + try (CloseableArray outputs = + CloseableArray.wrap(new DeviceMemoryBuffer[numOutputs])) { + // Each chunk needs to be copied, and each output needs a copy of the + // compressed chunk size vector representing the metadata. 
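+      // The parallel dest/src/size arrays below are built so that all copies
+      // can be issued with a single multiBufferCopyAsync call on the stream.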
+ final int totalBuffersToCopy = numOutputs + outputChunkAddrs.length; + long[] destAddrs = new long[totalBuffersToCopy]; + long[] srcAddrs = new long[totalBuffersToCopy]; + long[] sizes = new long[totalBuffersToCopy]; + int copyBufferIdx = 0; + int chunkIdx = 0; + for (int outputIdx = 0; outputIdx < numOutputs; outputIdx++) { + DeviceMemoryBuffer outputBuffer = DeviceMemoryBuffer.allocate(outputBufferSizes[outputIdx]); + final long outputBufferAddr = outputBuffer.getAddress(); + outputs.set(outputIdx, outputBuffer); + final long numChunks = chunksPerInput[outputIdx]; + final long metadataSize = numChunks * METADATA_BYTES_PER_CHUNK; + + // setup a copy of the metadata at the front of the output buffer + srcAddrs[copyBufferIdx] = chunkSizesAddr + chunkIdx * 8; + destAddrs[copyBufferIdx] = outputBufferAddr; + sizes[copyBufferIdx] = metadataSize; + ++copyBufferIdx; + + // setup copies of the compressed chunks for this output buffer + long nextChunkAddr = outputBufferAddr + metadataSize; + for (int i = 0; i < numChunks; ++i) { + srcAddrs[copyBufferIdx] = outputChunkAddrs[chunkIdx]; + destAddrs[copyBufferIdx] = nextChunkAddr; + final long chunkSize = outputChunkSizes[chunkIdx]; + sizes[copyBufferIdx] = chunkSize; + copyBufferIdx++; + chunkIdx++; + nextChunkAddr += chunkSize; + } } + assert copyBufferIdx == totalBuffersToCopy; + assert chunkIdx == outputChunkAddrs.length; + assert chunkIdx == outputChunkSizes.length; + + Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream); + return outputs.release(); } - throw t; } } - + // Calculate the list of sizes for each output buffer (metadata plus size of compressed chunks) + private long[] calcOutputBufferSizes(int[] chunksPerInput, + long[] outputChunkSizes) { + long[] sizes = new long[chunksPerInput.length]; + int chunkIdx = 0; + for (int i = 0; i < sizes.length; i++) { + final int chunksInBuffer = chunksPerInput[i]; + final int chunkEndIdx = chunkIdx + chunksInBuffer; + // metadata stored in front of compressed data + long bufferSize = METADATA_BYTES_PER_CHUNK * chunksInBuffer; + // add in the compressed chunk sizes to get the total size + while (chunkIdx < chunkEndIdx) { + bufferSize += outputChunkSizes[chunkIdx++]; + } + sizes[i] = bufferSize; + } + assert chunkIdx == outputChunkSizes.length; + return sizes; + } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java index 61969db4fb4..40ad4d5e9ed 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,214 +16,183 @@ package ai.rapids.cudf.nvcomp;
+import ai.rapids.cudf.CloseableArray;
 import ai.rapids.cudf.Cuda;
 import ai.rapids.cudf.BaseDeviceMemoryBuffer;
 import ai.rapids.cudf.DeviceMemoryBuffer;
-import ai.rapids.cudf.MemoryCleaner;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import ai.rapids.cudf.HostMemoryBuffer;
+import ai.rapids.cudf.NvtxColor;
+import ai.rapids.cudf.NvtxRange;
+
+import java.util.Arrays;
 
 /** LZ4 decompressor that operates on multiple input buffers in a batch */
 public class BatchedLZ4Decompressor {
-  private static final Logger log = LoggerFactory.getLogger(Decompressor.class);
-
-  /**
-   * Get the metadata associated with a batch of compressed buffers
-   * @param inputs compressed buffers that will be decompressed
-   * @param stream CUDA stream to use
-   * @return opaque metadata object
-   */
-  public static BatchedMetadata getMetadata(BaseDeviceMemoryBuffer[] inputs, Cuda.Stream stream) {
-    long[] inputAddrs = new long[inputs.length];
-    long[] inputSizes = new long[inputs.length];
-    for (int i = 0; i < inputs.length; ++i) {
-      BaseDeviceMemoryBuffer buffer = inputs[i];
-      inputAddrs[i] = buffer.getAddress();
-      inputSizes[i] = buffer.getLength();
-    }
-    return new BatchedMetadata(NvcompJni.batchedLZ4DecompressGetMetadata(
-        inputAddrs, inputSizes, stream.getStream()));
-  }
-
-  /**
-   * Get the amount of temporary storage required to decompress a batch of buffers
-   * @param metadata metadata retrieved from the compressed buffers
-   * @return amount in bytes of temporary storage space required to decompress the buffer batch
-   */
-  public static long getTempSize(BatchedMetadata metadata) {
-    return NvcompJni.batchedLZ4DecompressGetTempSize(metadata.getMetadata());
-  }
-
-  /**
-   * Get the amount of ouptut storage required to decopmress a batch of buffers
-   * @param metadata metadata retrieved from the compressed buffers
-   * @param numOutputs number of buffers in the batch
-   * @return amount in bytes of temporary storage space required to decompress the buffer batch
-   */
-  public static long[] getOutputSizes(BatchedMetadata metadata, int numOutputs) {
-    return NvcompJni.batchedLZ4DecompressGetOutputSize(metadata.getMetadata(), numOutputs);
-  }
-
   /**
    * Asynchronously decompress a batch of buffers
-   * @param inputs buffers to decompress
-   * @param tempBuffer temporary buffer
-   * @param metadata metadata retrieved from the compressed buffers
-   * @param outputs output buffers that will contain the compressed results
-   * @param stream CUDA stream to use
+   * @param chunkSize maximum uncompressed block size, must match value used during compression
+   * @param origInputs buffers to decompress, will be closed by this operation
+   * @param outputs output buffers that will contain the uncompressed results, each must be sized
+   *                to the exact decompressed size of the corresponding input
+   * @param stream CUDA stream to use
    */
-  public static void decompressAsync(BaseDeviceMemoryBuffer[] inputs,
-      BaseDeviceMemoryBuffer tempBuffer, BatchedMetadata metadata,
-      BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) {
-    int numBuffers = inputs.length;
-    if (outputs.length != numBuffers) {
-      throw new IllegalArgumentException("buffer count mismatch, " + numBuffers + " inputs and " +
-          outputs.length + " outputs");
-    }
+  public static void decompressAsync(long chunkSize,
+                                     BaseDeviceMemoryBuffer[] origInputs,
+                                     BaseDeviceMemoryBuffer[] outputs,
+                                     Cuda.Stream stream) {
+    try (CloseableArray inputs =
+        CloseableArray.wrap(Arrays.copyOf(origInputs, origInputs.length))) {
+
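+      // A copy of the input array is wrapped so that every input buffer is
+      // closed on exit, even if validation or setup below throws.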
BatchedLZ4Compressor.validateChunkSize(chunkSize); + if (origInputs.length != outputs.length) { + throw new IllegalArgumentException("number of inputs must match number of outputs"); + } + final int numInputs = inputs.size(); + if (numInputs == 0) { + return; + } - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } + int[] chunksPerInput = new int[numInputs]; + long totalChunks = 0; + for (int i = 0; i < numInputs; i++) { + // use output size to determine number of chunks in the input, as the output buffer + // must be exactly sized to the uncompressed data + BaseDeviceMemoryBuffer buffer = outputs[i]; + int numBufferChunks = getNumChunksInBuffer(chunkSize, buffer); + chunksPerInput[i] = numBufferChunks; + totalChunks += numBufferChunks; + } - long[] outputAddrs = new long[numBuffers]; - long[] outputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = outputs[i]; - outputAddrs[i] = buffer.getAddress(); - outputSizes[i] = buffer.getLength(); + final long tempBufferSize = NvcompJni.batchedLZ4DecompressGetTempSize(totalChunks, chunkSize); + try (DeviceMemoryBuffer devAddrsSizes = + buildAddrsSizesBuffer(chunkSize, totalChunks, inputs.getArray(), chunksPerInput, + outputs, stream); + DeviceMemoryBuffer devTemp = DeviceMemoryBuffer.allocate(tempBufferSize)) { + // buffer containing addresses and sizes contains four vectors of longs in this order: + // - compressed chunk input addresses + // - chunk output buffer addresses + // - compressed chunk sizes + // - uncompressed chunk sizes + final long inputAddrsPtr = devAddrsSizes.getAddress(); + final long outputAddrsPtr = inputAddrsPtr + totalChunks * 8; + final long inputSizesPtr = outputAddrsPtr + totalChunks * 8; + final long outputSizesPtr = inputSizesPtr + totalChunks * 8; + NvcompJni.batchedLZ4DecompressAsync( + inputAddrsPtr, + inputSizesPtr, + outputSizesPtr, + totalChunks, + devTemp.getAddress(), + devTemp.getLength(), + outputAddrsPtr, + stream.getStream()); + } } + } - NvcompJni.batchedLZ4DecompressAsync(inputAddrs, inputSizes, - tempBuffer.getAddress(), tempBuffer.getLength(), metadata.getMetadata(), - outputAddrs, outputSizes, stream.getStream()); + private static int getNumChunksInBuffer(long chunkSize, BaseDeviceMemoryBuffer buffer) { + return (int) ((buffer.getLength() + chunkSize - 1) / chunkSize); } /** - * Asynchronously decompress a batch of buffers - * @param inputs buffers to decompress + * Build a device memory buffer containing four vectors of longs in the following order: + *
+   * <ul>
+   * <li>compressed chunk input addresses</li>
+   * <li>uncompressed chunk output addresses</li>
+   * <li>compressed chunk sizes</li>
+   * <li>uncompressed chunk sizes</li>
+   * </ul>
+ * Each vector contains as many longs as the number of chunks being decompressed + * @param chunkSize maximum uncompressed size of a chunk + * @param totalChunks total number of chunks to be decompressed + * @param inputs device buffers containing the compressed data + * @param chunksPerInput number of compressed chunks per input buffer + * @param outputs device buffers that will hold the uncompressed output * @param stream CUDA stream to use - * @return output buffers containing compressed data corresponding to the input buffers + * @return device buffer containing address and size vectors */ - public static DeviceMemoryBuffer[] decompressAsync(BaseDeviceMemoryBuffer[] inputs, - Cuda.Stream stream) { - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - - long metadata = NvcompJni.batchedLZ4DecompressGetMetadata(inputAddrs, inputSizes, - stream.getStream()); - try { - long[] outputSizes = NvcompJni.batchedLZ4DecompressGetOutputSize(metadata, numBuffers); - long[] outputAddrs = new long[numBuffers]; - DeviceMemoryBuffer[] outputs = new DeviceMemoryBuffer[numBuffers]; - try { - for (int i = 0; i < numBuffers; ++i) { - DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(outputSizes[i]); - outputs[i] = buffer; - outputAddrs[i] = buffer.getAddress(); - } - - long tempSize = NvcompJni.batchedLZ4DecompressGetTempSize(metadata); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - NvcompJni.batchedLZ4DecompressAsync(inputAddrs, inputSizes, - tempBuffer.getAddress(), tempBuffer.getLength(), metadata, - outputAddrs, outputSizes, stream.getStream()); - } - } catch (Throwable t) { - for (DeviceMemoryBuffer buffer : outputs) { - if (buffer != null) { - buffer.close(); + private static DeviceMemoryBuffer buildAddrsSizesBuffer(long chunkSize, + long totalChunks, + BaseDeviceMemoryBuffer[] inputs, + int[] chunksPerInput, + BaseDeviceMemoryBuffer[] outputs, + Cuda.Stream stream) { + final long totalBufferSize = totalChunks * 8L * 4L; + try (NvtxRange range = new NvtxRange("buildAddrSizesBuffer", NvtxColor.YELLOW)) { + try (HostMemoryBuffer metadata = fetchMetadata(totalChunks, inputs, chunksPerInput, stream); + HostMemoryBuffer hostAddrsSizes = HostMemoryBuffer.allocate(totalBufferSize); + DeviceMemoryBuffer devAddrsSizes = DeviceMemoryBuffer.allocate(totalBufferSize)) { + // Build four long vectors in the AddrsSizes buffer: + // - compressed input address (one per chunk) + // - uncompressed output address (one per chunk) + // - compressed input size (one per chunk) + // - uncompressed input size (one per chunk) + final long srcAddrsOffset = 0; + final long destAddrsOffset = srcAddrsOffset + totalChunks * 8L; + final long srcSizesOffset = destAddrsOffset + totalChunks * 8L; + final long destSizesOffset = srcSizesOffset + totalChunks * 8L; + long chunkIdx = 0; + for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) { + final BaseDeviceMemoryBuffer input = inputs[inputIdx]; + final BaseDeviceMemoryBuffer output = outputs[inputIdx]; + final int numChunksInInput = chunksPerInput[inputIdx]; + long srcAddr = input.getAddress() + + BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK * numChunksInInput; + long destAddr = output.getAddress(); + final long chunkIdxEnd = chunkIdx + numChunksInInput; + while (chunkIdx < chunkIdxEnd) { + final long 
srcChunkSize = metadata.getLong(chunkIdx * 8); + final long destChunkSize = (chunkIdx < chunkIdxEnd - 1) ? chunkSize + : output.getAddress() + output.getLength() - destAddr; + hostAddrsSizes.setLong(srcAddrsOffset + chunkIdx * 8, srcAddr); + hostAddrsSizes.setLong(destAddrsOffset + chunkIdx * 8, destAddr); + hostAddrsSizes.setLong(srcSizesOffset + chunkIdx * 8, srcChunkSize); + hostAddrsSizes.setLong(destSizesOffset + chunkIdx * 8, destChunkSize); + srcAddr += srcChunkSize; + destAddr += destChunkSize; + ++chunkIdx; } } - throw t; + devAddrsSizes.copyFromHostBuffer(hostAddrsSizes, stream); + devAddrsSizes.incRefCount(); + return devAddrsSizes; } - - return outputs; - } finally { - NvcompJni.batchedLZ4DecompressDestroyMetadata(metadata); } } - /** Opaque metadata object for batched LZ4 decompression */ - public static class BatchedMetadata implements AutoCloseable { - private final BatchedMetadataCleaner cleaner; - private final long id; - private boolean closed = false; - - BatchedMetadata(long metadata) { - this.cleaner = new BatchedMetadataCleaner(metadata); - this.id = cleaner.id; - MemoryCleaner.register(this, cleaner); - cleaner.addRef(); - } - - long getMetadata() { - return cleaner.metadata; - } - - public boolean isLZ4Metadata() { - return NvcompJni.isLZ4Metadata(getMetadata()); - } - - @Override - public synchronized void close() { - if (!closed) { - cleaner.delRef(); - cleaner.clean(false); - closed = true; - } else { - cleaner.logRefCountDebug("double free " + this); - throw new IllegalStateException("Close called too many times " + this); - } - } - - @Override - public String toString() { - return "LZ4 BATCHED METADATA (ID: " + id + " " + - Long.toHexString(cleaner.metadata) + ")"; - } - - private static class BatchedMetadataCleaner extends MemoryCleaner.Cleaner { - private long metadata; - - BatchedMetadataCleaner(long metadata) { - this.metadata = metadata; - } - - @Override - protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { - boolean neededCleanup = false; - long address = metadata; - if (metadata != 0) { - try { - NvcompJni.batchedLZ4DecompressDestroyMetadata(metadata); - } finally { - // Always mark the resource as freed even if an exception is thrown. - // We cannot know how far it progressed before the exception, and - // therefore it is unsafe to retry. - metadata = 0; - } - neededCleanup = true; - } - if (neededCleanup && logErrorIfNotClean) { - log.error("LZ4 BATCHED METADATA WAS LEAKED (Address: " + Long.toHexString(address) + ")"); - logRefCountDebug("Leaked event"); + /** + * Fetch the metadata at the front of each input in a single, contiguous host buffer. + * @param totalChunks total number of compressed chunks + * @param inputs buffers containing the compressed data + * @param chunksPerInput number of compressed chunks for the corresponding input + * @param stream CUDA stream to use + * @return host buffer containing all of the metadata + */ + private static HostMemoryBuffer fetchMetadata(long totalChunks, + BaseDeviceMemoryBuffer[] inputs, + int[] chunksPerInput, + Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("fetchMetadata", NvtxColor.PURPLE)) { + // one long per chunk containing the compressed size + final long totalMetadataSize = totalChunks * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK; + // Build corresponding vectors of destination addresses, source addresses and sizes. 
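+      // One copy per input: each input's leading metadata block is gathered into
+      // a single contiguous device buffer before being fetched to the host.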
+ long[] destAddrs = new long[inputs.length]; + long[] srcAddrs = new long[inputs.length]; + long[] sizes = new long[inputs.length]; + try (HostMemoryBuffer hostMetadata = HostMemoryBuffer.allocate(totalMetadataSize); + DeviceMemoryBuffer devMetadata = DeviceMemoryBuffer.allocate(totalMetadataSize)) { + long destCopyAddr = devMetadata.getAddress(); + for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) { + final BaseDeviceMemoryBuffer input = inputs[inputIdx]; + final long copySize = chunksPerInput[inputIdx] * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK; + destAddrs[inputIdx] = destCopyAddr; + srcAddrs[inputIdx] = input.getAddress(); + sizes[inputIdx] = copySize; + destCopyAddr += copySize; } - return neededCleanup; - } - - @Override - public boolean isClean() { - return metadata != 0; + Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream); + hostMetadata.copyFromDeviceBuffer(devMetadata, stream); + hostMetadata.incRefCount(); + return hostMetadata; } } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java b/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java index 5a133acbf7c..70f0a021a4d 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,10 @@ public enum CompressionType { INT(4), UINT(5), LONGLONG(6), - ULONGLONG(7); + ULONGLONG(7), + BITS(0xff); + + private static final CompressionType[] types = CompressionType.values(); final int nativeId; @@ -33,6 +36,17 @@ public enum CompressionType { this.nativeId = nativeId; } + /** Lookup the CompressionType that corresponds to the specified native identifier */ + public static CompressionType fromNativeId(int id) { + for (CompressionType type : types) { + if (type.nativeId == id) { + return type; + } + } + throw new IllegalArgumentException("Unknown compression type ID: " + id); + } + + /** Get the native code identifier for the type */ public final int toNativeId() { return nativeId; } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java deleted file mode 100644 index 90dabfbcf8e..00000000000 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ai.rapids.cudf.nvcomp; - -import ai.rapids.cudf.Cuda; -import ai.rapids.cudf.BaseDeviceMemoryBuffer; -import ai.rapids.cudf.MemoryCleaner; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Generic single-buffer decompressor interface */ -public class Decompressor { - private static final Logger log = LoggerFactory.getLogger(Decompressor.class); - - /** - * Get the metadata associated with a compressed buffer - * @param buffer compressed data buffer - * @param stream CUDA stream to use - * @return opaque metadata object - */ - public static Metadata getMetadata(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) { - long metadata = NvcompJni.decompressGetMetadata(buffer.getAddress(), buffer.getLength(), - stream.getStream()); - return new Metadata(metadata); - } - - /** - * Get the amount of temporary storage space required to decompress a buffer. - * @param metadata metadata retrieved from the compressed data - * @return amount in bytes of temporary storage space required to decompress - */ - public static long getTempSize(Metadata metadata) { - return NvcompJni.decompressGetTempSize(metadata.getMetadata()); - } - - /** - * Get the amount of output storage space required to hold the uncompressed data. - * @param metadata metadata retrieved from the compressed data - * @return amount in bytes of output storage space required to decompress - */ - public static long getOutputSize(Metadata metadata) { - return NvcompJni.decompressGetOutputSize(metadata.getMetadata()); - } - - /** - * Asynchronously decompress a buffer. - * @param input compressed data buffer - * @param tempBuffer temporary storage buffer - * @param metadata metadata retrieved from compressed data - * @param output output storage buffer - * @param stream CUDA stream to use - */ - public static void decompressAsync(BaseDeviceMemoryBuffer input, BaseDeviceMemoryBuffer tempBuffer, - Metadata metadata, BaseDeviceMemoryBuffer output, Cuda.Stream stream) { - NvcompJni.decompressAsync( - input.getAddress(), input.getLength(), - tempBuffer.getAddress(), tempBuffer.getLength(), - metadata.getMetadata(), - output.getAddress(), output.getLength(), - stream.getStream()); - } - - /** - * Determine if a buffer is data compressed with LZ4. 
- * @param buffer data to examine - * @return true if the data is LZ4 compressed - */ - public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer) { - return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength()); - } - - - /** Opaque metadata object for single-buffer decompression */ - public static class Metadata implements AutoCloseable { - private final MetadataCleaner cleaner; - private final long id; - private boolean closed = false; - - Metadata(long metadata) { - this.cleaner = new MetadataCleaner(metadata); - this.id = cleaner.id; - MemoryCleaner.register(this, cleaner); - cleaner.addRef(); - } - - long getMetadata() { - return cleaner.metadata; - } - - /** - * Determine if this metadata is associated with LZ4-compressed data - * @return true if the metadata is associated with LZ4-compressed data - */ - public boolean isLZ4Metadata() { - return NvcompJni.isLZ4Metadata(getMetadata()); - } - - @Override - public synchronized void close() { - if (!closed) { - cleaner.delRef(); - cleaner.clean(false); - closed = true; - } else { - cleaner.logRefCountDebug("double free " + this); - throw new IllegalStateException("Close called too many times " + this); - } - } - - @Override - public String toString() { - return "DECOMPRESSOR METADATA (ID: " + id + " " + - Long.toHexString(cleaner.metadata) + ")"; - } - - private static class MetadataCleaner extends MemoryCleaner.Cleaner { - private long metadata; - - MetadataCleaner(long metadata) { - this.metadata = metadata; - } - - @Override - protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { - boolean neededCleanup = false; - long address = metadata; - if (metadata != 0) { - try { - NvcompJni.decompressDestroyMetadata(metadata); - } finally { - // Always mark the resource as freed even if an exception is thrown. - // We cannot know how far it progressed before the exception, and - // therefore it is unsafe to retry. - metadata = 0; - } - neededCleanup = true; - } - if (neededCleanup && logErrorIfNotClean) { - log.error("DECOMPRESSOR METADATA WAS LEAKED (Address: " + - Long.toHexString(address) + ")"); - logRefCountDebug("Leaked event"); - } - return neededCleanup; - } - - @Override - public boolean isClean() { - return metadata != 0; - } - } - } -} diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java index ce7012a3bee..67a770f1346 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,46 +18,54 @@ import ai.rapids.cudf.Cuda; import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.DeviceMemoryBuffer; import ai.rapids.cudf.HostMemoryBuffer; /** Single-buffer compressor implementing LZ4 */ public class LZ4Compressor { - /** - * Calculate the amount of temporary storage space required to compress a buffer. 
- * @param input buffer to compress - * @param inputType type of data within the buffer - * @param chunkSize compression chunk size to use - * @return amount in bytes of temporary storage space required to compress the buffer - */ - public static long getTempSize(BaseDeviceMemoryBuffer input, CompressionType inputType, - long chunkSize) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + /** LZ4 compression settings corresponding to a chunk size */ + public static final class Configuration { + private final long metadataBytes; + private final long tempBytes; + private final long maxCompressedBytes; + + Configuration(long metadataBytes, long tempBytes, long maxCompressedBytes) { + this.metadataBytes = metadataBytes; + this.tempBytes = tempBytes; + this.maxCompressedBytes = maxCompressedBytes; + } + + /** Get the size of the metadata information in bytes */ + public long getMetadataBytes() { + return metadataBytes; + } + + /** Get the size of the temporary storage in bytes needed to compress */ + public long getTempBytes() { + return tempBytes; + } + + /** Get the maximum compressed output size in bytes */ + public long getMaxCompressedBytes() { + return maxCompressedBytes; } - return NvcompJni.lz4CompressGetTempSize(input.getAddress(), input.getLength(), - inputType.nativeId, chunkSize); } /** - * Calculate the amount of output storage space required to compress a buffer. - * @param input buffer to compress - * @param inputType type of data within the buffer - * @param chunkSize compression chunk size to use - * @param tempBuffer temporary storage space - * @return amount in bytes of output storage space required to compress the buffer + * Get the compression configuration necessary for a particular chunk size. + * @param chunkSize size of an LZ4 chunk in bytes + * @param uncompressedSize total size of the uncompressed data + * @return compression configuration for the specified chunk size */ - public static long getOutputSize(BaseDeviceMemoryBuffer input, CompressionType inputType, - long chunkSize, BaseDeviceMemoryBuffer tempBuffer) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - return NvcompJni.lz4CompressGetOutputSize(input.getAddress(), input.getLength(), - inputType.nativeId, chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), false); + public static Configuration configure(long chunkSize, long uncompressedSize) { + long[] configs = NvcompJni.lz4CompressConfigure(chunkSize, uncompressedSize); + assert configs.length == 3; + return new Configuration(configs[0], configs[1], configs[2]); } /** - * Compress a buffer with LZ4. + * Synchronously compress a buffer with LZ4. 
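+   * This blocks on the given stream to read back the compressed size.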
* @param input buffer to compress * @param inputType type of data within the buffer * @param chunkSize compression chunk size to use @@ -72,16 +80,19 @@ public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputT if (chunkSize <= 0) { throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); } - return NvcompJni.lz4Compress(input.getAddress(), input.getLength(), inputType.nativeId, - chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), - output.getAddress(), output.getLength(), stream.getStream()); + try (DeviceMemoryBuffer devOutputSizeBuffer = DeviceMemoryBuffer.allocate(Long.BYTES); + HostMemoryBuffer hostOutputSizeBuffer = HostMemoryBuffer.allocate(Long.BYTES)) { + compressAsync(devOutputSizeBuffer, input, inputType, chunkSize, tempBuffer, output, stream); + hostOutputSizeBuffer.copyFromDeviceBuffer(devOutputSizeBuffer, stream); + return hostOutputSizeBuffer.getLong(0); + } } /** * Asynchronously compress a buffer with LZ4. The compressed size output buffer must be pinned * memory for this operation to be truly asynchronous. Note that the caller must synchronize * on the specified CUDA stream in order to safely examine the compressed output size! - * @param compressedSizeOutputBuffer host memory where the compressed output size will be stored + * @param compressedSizeOutputBuffer device memory where the compressed output size will be stored * @param input buffer to compress * @param inputType type of data within the buffer * @param chunkSize compression chunk size to use @@ -89,7 +100,7 @@ public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputT * @param output buffer that will contain the compressed result * @param stream CUDA stream to use */ - public static void compressAsync(HostMemoryBuffer compressedSizeOutputBuffer, + public static void compressAsync(DeviceMemoryBuffer compressedSizeOutputBuffer, BaseDeviceMemoryBuffer input, CompressionType inputType, long chunkSize, BaseDeviceMemoryBuffer tempBuffer, BaseDeviceMemoryBuffer output, Cuda.Stream stream) { @@ -100,9 +111,16 @@ public static void compressAsync(HostMemoryBuffer compressedSizeOutputBuffer, throw new IllegalArgumentException("compressed output size buffer must be able to hold " + "at least 8 bytes, size is only " + compressedSizeOutputBuffer.getLength()); } - NvcompJni.lz4CompressAsync(compressedSizeOutputBuffer.getAddress(), - input.getAddress(), input.getLength(), inputType.nativeId, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength(), output.getAddress(), output.getLength(), + NvcompJni.lz4CompressAsync( + compressedSizeOutputBuffer.getAddress(), + input.getAddress(), + input.getLength(), + inputType.nativeId, + chunkSize, + tempBuffer.getAddress(), + tempBuffer.getLength(), + output.getAddress(), + output.getLength(), stream.getStream()); } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java new file mode 100644 index 00000000000..46b3127581b --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf.nvcomp; + +import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.Cuda; + +/** Single-buffer decompression using LZ4 */ +public class LZ4Decompressor { + + /** + * LZ4 decompression settings corresponding to an LZ4 compressed input. + * NOTE: Each instance must be closed to avoid a native memory leak. + */ + public static final class Configuration implements AutoCloseable { + private final long metadataPtr; + private final long metadataSize; + private final long tempBytes; + private final long uncompressedBytes; + + Configuration(long metadataPtr, long metadataSize, long tempBytes, + long uncompressedBytes) { + this.metadataPtr = metadataPtr; + this.metadataSize = metadataSize; + this.tempBytes = tempBytes; + this.uncompressedBytes = uncompressedBytes; + } + + /** Get the host address of the metadata */ + public long getMetadataPtr() { + return metadataPtr; + } + + /** Get the size of the metadata in bytes */ + public long getMetadataSize() { + return metadataSize; + } + + /** Get the size of the temporary buffer in bytes needed to decompress */ + public long getTempBytes() { + return tempBytes; + } + + /** Get the size of the uncompressed data in bytes */ + public long getUncompressedBytes() { + return uncompressedBytes; + } + + @Override + public void close() { + NvcompJni.lz4DestroyMetadata(metadataPtr); + } + } + + /** + * Determine if a buffer is data compressed with LZ4. + * @param buffer data to examine + * @param stream CUDA stream to use + * @return true if the data is LZ4 compressed + */ + public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) { + return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength(), stream.getStream()); + } + + /** + * Get the decompression configuration from compressed data. + * NOTE: The resulting configuration object must be closed to avoid a native memory leak. + * @param compressed data that has been compressed by the LZ4 compressor + * @param stream CUDA stream to use + * @return decompression configuration for the specified input + */ + public static Configuration configure(BaseDeviceMemoryBuffer compressed, Cuda.Stream stream) { + long[] configs = NvcompJni.lz4DecompressConfigure(compressed.getAddress(), + compressed.getLength(), stream.getStream()); + assert configs.length == 4; + return new Configuration(configs[0], configs[1], configs[2], configs[3]); + } + + /** + * Asynchronously decompress data compressed with the LZ4 compressor. 
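+   * Note: the caller must synchronize on the stream before examining the
+   * uncompressed output.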
+ * @param compressed buffer containing LZ4-compressed data + * @param config decompression configuration + * @param temp temporary storage buffer + * @param outputBuffer buffer that will be written with the uncompressed output + * @param stream CUDA stream to use + */ + public static void decompressAsync( + BaseDeviceMemoryBuffer compressed, + Configuration config, + BaseDeviceMemoryBuffer temp, + BaseDeviceMemoryBuffer outputBuffer, + Cuda.Stream stream) { + NvcompJni.lz4DecompressAsync( + compressed.getAddress(), + compressed.getLength(), + config.getMetadataPtr(), + config.getMetadataSize(), + temp.getAddress(), + temp.getLength(), + outputBuffer.getAddress(), + outputBuffer.getLength(), + stream.getStream()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java index 5ce0a8d815d..58f8390d0eb 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,69 +24,14 @@ class NvcompJni { NativeDepsLoader.loadNativeDeps(); } - /** - * Extracts the metadata from the input on the device and copies - * it to the host. Note that the result must be released with a - * call to decompressDestroyMetadata - * @param inPtr device address of the compressed data - * @param inSize size of the compressed data in bytes - * @param stream address of CUDA stream that will be used for synchronization - * @return address of the metadata on the host - */ - static native long decompressGetMetadata(long inPtr, long inSize, long stream); - - /** - * Destroys the metadata object and frees the associated memory. - * @param metadataPtr address of the metadata object - */ - static native void decompressDestroyMetadata(long metadataPtr); - - /** - * Computes the temporary storage size needed to decompress. - * This over-estimates the needed storage considerably. - * @param metadataPtr address of the metadata object - * @return the number of temporary storage bytes needed to decompress - */ - static native long decompressGetTempSize(long metadataPtr); - - /** - * Computes the decompressed size of the data. Gets this from the - * metadata contained in the compressed data. - * @param metadataPtr address of the metadata object - * @return the size of the decompressed data in bytes - */ - static native long decompressGetOutputSize(long metadataPtr); - - /** - * Perform asynchronous decompression using the specified CUDA stream. - * The input, temporary, and output buffers must all be in GPU-accessible - * memory. 
- * @param inPtr device address of the compressed buffer - * @param inSize size of the compressed data in bytes - * @param tempPtr device address of the temporary decompression storage buffer - * @param tempSize size of the temporary decompression storage buffer - * @param metadataPtr address of the metadata object - * @param outPtr device address of the buffer to use for uncompressed output - * @param outSize size of the uncompressed output buffer in bytes - * @param stream CUDA stream to use - */ - static native void decompressAsync( - long inPtr, - long inSize, - long tempPtr, - long tempSize, - long metadataPtr, - long outPtr, - long outSize, - long stream); - /** * Determine if data is compressed with the nvcomp LZ4 compressor. * @param inPtr device address of the compressed data * @param inSize size of the compressed data in bytes + * @param stream CUDA stream to use * @return true if the data is compressed with the nvcomp LZ4 compressor */ - static native boolean isLZ4Data(long inPtr, long inSize); + static native boolean isLZ4Data(long inPtr, long inSize, long stream); /** * Determine if the metadata corresponds to data compressed with the nvcomp LZ4 compressor. @@ -96,45 +41,21 @@ static native void decompressAsync( static native boolean isLZ4Metadata(long metadataPtr); /** - * Calculate the temporary buffer size needed for LZ4 compression. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes - * @return number of temporary storage bytes needed to compress + * Return the LZ4 compression configuration necessary for a particular chunk size. + * @param chunkSize maximum size of an uncompressed chunk in bytes + * @param uncompressedSize total size of the uncompressed data + * @return array of three longs containing metadata size, temp storage size, + * and output buffer size */ - static native long lz4CompressGetTempSize( - long inPtr, - long inSize, - int inputType, - long chunkSize); + static native long[] lz4CompressConfigure(long chunkSize, long uncompressedSize); /** - * Calculate the output buffer size for LZ4 compression. The output - * size can be an estimated upper bound or the exact value. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes - * @param tempPtr device address of the temporary storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param computeExactSize set to true to compute the exact output size - * @return output buffer size in bytes. If computeExactSize is true then - * this is an exact size otherwise it is a maximum, worst-case size of the - * compressed data. - */ - static native long lz4CompressGetOutputSize( - long inPtr, - long inSize, - int inputType, - long chunkSize, - long tempPtr, - long tempSize, - boolean computeExactSize); - - /** - * Perform LZ4 compression synchronously using the specified CUDA - * stream. + * Perform LZ4 compression asynchronously using the specified CUDA stream. + * @param compressedSizeOutputPtr host address of a 64-bit integer to update + * with the resulting compressed size of the + * data. For the operation to be truly + * asynchronous this should point to pinned + * host memory. 
 * @param inPtr device address of the uncompressed data
 * @param inSize size of the uncompressed data in bytes
 * @param inputType type of uncompressed data
@@ -144,9 +65,9 @@ static native long lz4CompressGetOutputSize(
 * @param outPtr device address of the output buffer
 * @param outSize size of the output buffer in bytes
 * @param stream CUDA stream to use
- * @return size of the compressed output in bytes
 */
- static native long lz4Compress(
+ static native void lz4CompressAsync(
+ long compressedSizeOutputPtr,
 long inPtr,
 long inSize,
 int inputType,
@@ -158,29 +79,33 @@ static native long lz4Compress(
 long stream);

 /**
- * Perform LZ4 compression synchronously using the specified CUDA
- * stream.
- * @param compressedSizeOutputPtr host address of a 64-bit integer to update
- * with the resulting compressed size of the
- * data. For the operation to be truly
- * asynchronous this should point to pinned
- * host memory.
+ * Return the decompression configuration for a compressed input.
+ * NOTE: The resulting configuration object must be closed to destroy the corresponding
+ * host-side metadata created by this method to avoid a native memory leak.
+ * @param inPtr device address of the compressed data
+ * @param inSize size of the compressed data in bytes
+ * @param stream CUDA stream to use
+ * @return array of four longs containing metadata address, metadata size, temp storage size,
+ * and output buffer size
+ */
+ static native long[] lz4DecompressConfigure(long inPtr, long inSize, long stream);
+
+ /**
+ * Perform LZ4 decompression asynchronously using the specified CUDA stream.
- * @param inPtr device address of the uncompressed data
- * @param inSize size of the uncompressed data in bytes
- * @param inputType type of uncompressed data
- * @param chunkSize size of an LZ4 chunk in bytes
- * @param tempPtr device address of the temporary compression storage buffer
+ * @param inPtr device address of the compressed data
+ * @param inSize size of the compressed data in bytes
+ * @param metadataPtr host address of the metadata
+ * @param metadataSize size of the metadata in bytes
+ * @param tempPtr device address of the temporary decompression storage buffer
 * @param tempSize size of the temporary storage buffer in bytes
 * @param outPtr device address of the output buffer
 * @param outSize size of the output buffer in bytes
 * @param stream CUDA stream to use
 */
- static native void lz4CompressAsync(
- long compressedSizeOutputPtr,
+ static native void lz4DecompressAsync(
 long inPtr,
 long inSize,
- int inputType,
- long chunkSize,
+ long metadataPtr,
+ long metadataSize,
 long tempPtr,
 long tempSize,
 long outPtr,
@@ -188,229 +113,99 @@ static native void lz4CompressAsync(
 long stream);

 /**
- * Extracts the metadata from the batch of inputs on the device and
- * copies them to the host. This synchronizes on the stream.
- * @param inPtrs device addresses of the compressed buffers in the batch
- * @param inSizes corresponding byte sizes of the compressed buffers
- * @param stream CUDA stream to use
- * @return handle to the batched decompress metadata on the host
- */
- static native long batchedLZ4DecompressGetMetadata(
- long[] inPtrs,
- long[] inSizes,
- long stream);
-
- /**
- * Destroys batched metadata and frees the underlying host memory.
- * @param batchedMetadata handle to the batched decompress metadata
+ * Destroy host-side metadata created by {@link NvcompJni#lz4DecompressConfigure(long, long, long)}
+ * @param metadataPtr host address of metadata
 */
- static native void batchedLZ4DecompressDestroyMetadata(long batchedMetadata);
+ static native void lz4DestroyMetadata(long metadataPtr);

 /**
- * Computes the temporary storage size in bytes needed to decompress
- * the compressed batch.
- * @param batchedMetadata handle to the batched metadata
- * @return number of temporary storage bytes needed to decompress the batch
- */
- static native long batchedLZ4DecompressGetTempSize(long batchedMetadata);
-
- /**
- * Computes the decompressed size of each chunk in the batch.
- * @param batchedMetadata handle to the batched metadata
- * @param numOutputs number of output buffers in the batch
- * @return Array of corresponding output sizes needed to decompress
- * each buffer in the batch.
- */
- static native long[] batchedLZ4DecompressGetOutputSize(
- long batchedMetadata,
- long numOutputs);
-
- /**
- * Asynchronously decompress a batch of compressed data buffers.
- * @param inPtrs device addresses of the compressed buffers
- * @param inSizes corresponding byte sizes of the compressed buffers
- * @param tempPtr device address of the temporary decompression space
- * @param tempSize size of the temporary decompression space in bytes
- * @param batchedMetadata handle to the batched metadata
- * @param outPtrs device addresses of the uncompressed output buffers
- * @param outSizes corresponding byte sizes of the uncompressed output buffers
- * @param stream CUDA stream to use
- */
- static native void batchedLZ4DecompressAsync(
- long[] inPtrs,
- long[] inSizes,
- long tempPtr,
- long tempSize,
- long batchedMetadata,
- long[] outPtrs,
- long[] outSizes,
- long stream);
-
- /**
- * Get the temporary workspace size required to perform compression of entire batch.
- * @param inPtrs device addresses of the uncompressed buffers
- * @param inSizes corresponding byte sizes of the uncompressed buffers
- * @param chunkSize size of an LZ4 chunk in bytes
+ * Get the temporary workspace size required to perform compression of an entire LZ4 batch.
+ * @param batchSize number of chunks in the batch
+ * @param maxChunkSize maximum size of an uncompressed chunk in bytes
 * @return The size of required temporary workspace in bytes to compress the batch.
 */
- static native long batchedLZ4CompressGetTempSize(
- long[] inPtrs,
- long[] inSizes,
- long chunkSize);
+ static native long batchedLZ4CompressGetTempSize(long batchSize, long maxChunkSize);

 /**
- * Get the required output sizes of each chunk to perform compression.
- * @param inPtrs device addresses of the uncompressed buffers
- * @param inSizes corresponding byte sizes of the uncompressed buffers
- * @param chunkSize size of an LZ4 chunk in bytes
- * @param tempPtr device address of the temporary workspace buffer
- * @param tempSize size of the temporary workspace buffer in bytes
- * @return array of corresponding sizes in bytes of the output buffers needed
- * to compress the buffers in the batch.
+ * Get the maximum size any chunk could compress to in an LZ4 batch. This is the minimum amount of
+ * output memory to allocate per chunk when batch compressing.
+ * @param maxChunkSize maximum size of an uncompressed chunk in bytes
+ * @return maximum compressed output size of a chunk
 */
- static native long[] batchedLZ4CompressGetOutputSize(
- long[] inPtrs,
- long[] inSizes,
- long chunkSize,
- long tempPtr,
- long tempSize);
+ static native long batchedLZ4CompressGetMaxOutputChunkSize(long maxChunkSize);

 /**
- * Asynchronously compress a batch of buffers. Note that
+ * Asynchronously compress a batch of buffers with LZ4. Note that
 * compressedSizesOutPtr must point to pinned memory for this operation
 * to be asynchronous.
- * @param compressedSizesOutPtr host address where to write the sizes of the
+ * @param devInPtrs device address of uncompressed buffer addresses vector
+ * @param devInSizes device address of uncompressed buffer sizes vector
+ * @param chunkSize maximum size of an uncompressed chunk in bytes
+ * @param batchSize number of chunks in the batch
+ * @param tempPtr device address of the temporary workspace buffer
+ * @param tempSize size of the temporary workspace buffer in bytes
+ * @param devOutPtrs device address of output buffer addresses vector
+ * @param compressedSizesOutPtr device address at which to write the sizes of the
 * compressed data written to the corresponding
 * output buffers. Must point to a buffer with
 * at least 8 bytes of memory per output buffer
- * in the batch. For asynchronous operation
- * this must point to pinned host memory.
- * @param inPtrs device addresses of the uncompressed buffers
- * @param inSizes corresponding byte sizes of the uncompressed buffers
- * @param chunkSize size of an LZ4 chunk in bytes
- * @param tempPtr device address of the temporary workspace buffer
- * @param tempSize size of the temporary workspace buffer in bytes
- * @param outPtrs device addresses of the output compressed buffers
- * @param outSizes corresponding sizes in bytes of the output buffers
+ * in the batch.
 * @param stream CUDA stream to use
 */
 static native void batchedLZ4CompressAsync(
- long compressedSizesOutPtr,
- long[] inPtrs,
- long[] inSizes,
+ long devInPtrs,
+ long devInSizes,
 long chunkSize,
+ long batchSize,
 long tempPtr,
 long tempSize,
- long[] outPtrs,
- long[] outSizes,
+ long devOutPtrs,
+ long compressedSizesOutPtr,
 long stream);

 /**
- * Calculate the temporary buffer size needed for cascaded compression.
- * @param inPtr device address of the uncompressed data
- * @param inSize size of the uncompressed data in bytes
- * @param inputType type of uncompressed data
- * @param numRLEs number of Run Length Encoding layers to use
- * @param numDeltas number of delta layers to use
- * @param useBitPacking set to true if bit-packing should be used
- * @return number of temporary storage bytes needed to compress
- */
- static native long cascadedCompressGetTempSize(
- long inPtr,
- long inSize,
- int inputType,
- int numRLEs,
- int numDeltas,
- boolean useBitPacking);
-
- /**
- * Calculate the output buffer size for cascaded compression. The output
- * size can be an estimated upper bound or the exact value.
- * @param inPtr device address of the uncompressed data
- * @param inSize size of the uncompressed data in bytes
- * @param inputType type of uncompressed data
- * @param numRLEs number of Run Length Encoding layers to use
- * @param numDeltas number of delta layers to use
- * @param useBitPacking set to true if bit-packing should be used
- * @param tempPtr device address of the temporary compression storage buffer
- * @param tempSize size of the temporary storage buffer in bytes
- * @param computeExactSize set to true to compute the exact output size
- * @return output buffer size in bytes. If computeExactSize is true then
- * this is an exact size otherwise it is a maximum, worst-case size of the
- * compressed data.
+ * Computes the temporary storage size in bytes needed to decompress an LZ4-compressed batch.
+ * @param numChunks number of chunks in the batch + * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes + * @return number of temporary storage bytes needed to decompress the batch */ - static native long cascadedCompressGetOutputSize( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, - long tempPtr, - long tempSize, - boolean computeExactSize); + static native long batchedLZ4DecompressGetTempSize( + long numChunks, + long maxUncompressedChunkBytes); /** - * Perform cascaded compression synchronously using the specified CUDA - * stream. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param outPtr device address of the output buffer - * @param outSize size of the output buffer in bytes + * Asynchronously decompress a batch of LZ4-compressed data buffers. + * @param devInPtrs device address of compressed input buffer addresses vector + * @param devInSizes device address of compressed input buffer sizes vector + * @param devOutSizes device address of uncompressed buffer sizes vector + * @param batchSize number of buffers in the batch + * @param tempPtr device address of the temporary decompression space + * @param tempSize size of the temporary decompression space in bytes + * @param devOutPtrs device address of uncompressed output buffer addresses vector * @param stream CUDA stream to use - * @return size of the compressed output in bytes */ - static native long cascadedCompress( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, + static native void batchedLZ4DecompressAsync( + long devInPtrs, + long devInSizes, + long devOutSizes, + long batchSize, long tempPtr, long tempSize, - long outPtr, - long outSize, + long devOutPtrs, long stream); /** - * Perform cascaded compression asynchronously using the specified CUDA - * stream. Note if the compressedSizeOutputPtr argument points to paged - * memory then this may synchronize in practice due to limitations of - * copying from the device to paged memory. - * @param compressedSizeOutputPtr address of a 64-bit integer to update with - * the resulting compressed size of the data. - * For the operation to be truly asynchronous - * this should point to pinned host memory. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param outPtr device address of the output buffer - * @param outSize size of the output buffer in bytes + * Asynchronously calculates the decompressed size needed for each chunk. 
+ * @param devInPtrs device address of compressed input buffer addresses vector + * @param devInSizes device address of compressed input buffer sizes vector + * @param devOutSizes device address of calculated decompress sizes vector + * @param batchSize number of buffers in the batch * @param stream CUDA stream to use */ - static native void cascadedCompressAsync( - long compressedSizeOutputPtr, - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, - long tempPtr, - long tempSize, - long outPtr, - long outSize, + static native void batchedLZ4GetDecompressSizeAsync( + long devInPtrs, + long devInSizes, + long devOutSizes, + long batchSize, long stream); } diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 2c95c6eebac..b5c7cfe8e6f 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -184,11 +184,21 @@ endif() ################################################################################################### # - nvcomp ---------------------------------------------------------------------------------------- -include(ConfigureNvcomp) -if(NVCOMP_FOUND) - message(STATUS "nvcomp compression library found in ${NVCOMP_ROOT}") +find_path(NVCOMP_INCLUDE "nvcomp" + HINTS "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-src/include" + "$ENV{CONDA_PREFIX}/include") + +message(STATUS "NVCOMP: NVCOMP_INCLUDE set to ${NVCOMP_INCLUDE}") + +set(CUDF_JNI_NVCOMP_LIBNAME "libnvcomp.a") +find_library(NVCOMP_LIBRARY ${CUDF_JNI_NVCOMP_LIBNAME} REQUIRED + HINTS "${CUDF_CPP_BUILD_DIR}/lib" + "$ENV{CONDA_PREFIX}/lib") + +if(NOT NVCOMP_LIBRARY) + message(FATAL_ERROR "nvcomp static library not found.") else() - message(FATAL_ERROR "nvcomp compression library not found.") + message(STATUS "NVCOMP: NVCOMP_LIBRARY set to ${NVCOMP_LIBRARY}") endif() ################################################################################################### @@ -218,7 +228,8 @@ add_library(cudfjni SHARED src/RmmJni.cpp src/ScalarJni.cpp src/TableJni.cpp - src/map_lookup.cu) + src/map_lookup.cu + src/check_nvcomp_output_sizes.cu) ################################################################################################### # - include paths --------------------------------------------------------------------------------- @@ -229,7 +240,7 @@ target_include_directories(cudfjni "${CUB_INCLUDE}" "${LIBCUDACXX_INCLUDE}" "${CUDAToolkit_INCLUDE_DIRS}" - "${NVCOMP_INCLUDE_DIR}" + "${NVCOMP_INCLUDE}" "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" "${SPDLOG_INCLUDE}" @@ -293,7 +304,7 @@ target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM ################################################################################################### # - link libraries -------------------------------------------------------------------------------- -target_link_libraries(cudfjni PRIVATE nvcomp ${CUDF_LIB} ${ARROW_LIBRARY}) +target_link_libraries(cudfjni PRIVATE ${NVCOMP_LIBRARY} ${CUDF_LIB} ${ARROW_LIBRARY}) ################################################################################################### # - cudart options -------------------------------------------------------------------------------- diff --git a/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake b/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake deleted file mode 100644 index 1a0083e4518..00000000000 --- a/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake +++ /dev/null @@ -1,79 +0,0 @@ 
-#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -set(NVCOMP_ROOT "${CMAKE_BINARY_DIR}/nvcomp") - -if(CUDA_STATIC_RUNTIME) - set(NVCOMP_CUDA_RUNTIME_LIBRARY Static) -else() - set(NVCOMP_CUDA_RUNTIME_LIBRARY Shared) -endif() - -set(NVCOMP_CMAKE_ARGS "-DCMAKE_CUDA_RUNTIME_LIBRARY=${NVCOMP_CUDA_RUNTIME_LIBRARY} -DUSE_RMM=ON -DCUB_DIR=${CUB_INCLUDE}") - -configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Nvcomp.CMakeLists.txt.cmake" - "${NVCOMP_ROOT}/CMakeLists.txt") - -file(MAKE_DIRECTORY "${NVCOMP_ROOT}/build") - -execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} . - RESULT_VARIABLE NVCOMP_CONFIG - WORKING_DIRECTORY ${NVCOMP_ROOT}) - -if(NVCOMP_CONFIG) - message(FATAL_ERROR "Configuring nvcomp failed: " ${NVCOMP_CONFIG}) -endif() - -set(PARALLEL_BUILD -j) -if($ENV{PARALLEL_LEVEL}) - set(NUM_JOBS $ENV{PARALLEL_LEVEL}) - set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") -endif() - -if(${NUM_JOBS}) - if(${NUM_JOBS} EQUAL 1) - message(STATUS "NVCOMP BUILD: Enabling Sequential CMake build") - elseif(${NUM_JOBS} GREATER 1) - message(STATUS "NVCOMP BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") - endif() -else() - message(STATUS "NVCOMP BUILD: Enabling Parallel CMake build with all threads") -endif() - -execute_process(COMMAND ${CMAKE_COMMAND} --build .. -- ${PARALLEL_BUILD} - RESULT_VARIABLE NVCOMP_BUILD - WORKING_DIRECTORY ${NVCOMP_ROOT}/build) - -if(NVCOMP_BUILD) - message(FATAL_ERROR "Building nvcomp failed: " ${NVCOMP_BUILD}) -endif() - -message(STATUS "nvcomp build completed at: " ${NVCOMP_ROOT}/build) - -set(NVCOMP_INCLUDE_DIR "${NVCOMP_ROOT}/build/include") -set(NVCOMP_LIBRARY_DIR "${NVCOMP_ROOT}/build/lib") - -find_library(NVCOMP_LIB nvcomp - NO_DEFAULT_PATH - HINTS "${NVCOMP_LIBRARY_DIR}") - -if(NVCOMP_LIB) - message(STATUS "nvcomp library: " ${NVCOMP_LIB}) - set(NVCOMP_FOUND TRUE) - - add_library(nvcomp STATIC IMPORTED) - set_target_properties(nvcomp PROPERTIES IMPORTED_LOCATION "${NVCOMP_LIB}") -endif() diff --git a/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake b/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake deleted file mode 100644 index 5761ef8fd1b..00000000000 --- a/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake +++ /dev/null @@ -1,33 +0,0 @@ -#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -cmake_minimum_required(VERSION 3.12) - -project(nvcomp) - -include(ExternalProject) - -ExternalProject_Add(nvcomp - GIT_REPOSITORY https://github.com/NVIDIA/nvcomp.git - GIT_TAG v1.2.1 - GIT_SHALLOW true - SOURCE_DIR "${NVCOMP_ROOT}/nvcomp" - BINARY_DIR "${NVCOMP_ROOT}/build" - INSTALL_DIR "${NVCOMP_ROOT}/install" - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/nvcomp.patch || true - CMAKE_ARGS ${NVCOMP_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${NVCOMP_ROOT}/install - BUILD_COMMAND ${CMAKE_COMMAND} --build . --target nvcomp - INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping nvcomp install step.") diff --git a/java/src/main/native/cmake/nvcomp.patch b/java/src/main/native/cmake/nvcomp.patch deleted file mode 100644 index ea1340b7754..00000000000 --- a/java/src/main/native/cmake/nvcomp.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 32f48ef..a2e3125 100644 ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -10,7 +10,9 @@ endif() - file(GLOB CUDA_SOURCES *.cu) - file(GLOB CPP_SOURCES *.cpp) - --add_library(nvcomp SHARED ${CUDA_SOURCES} ${CPP_SOURCES}) -+ -+add_library(nvcomp STATIC ${CUDA_SOURCES} ${CPP_SOURCES}) -+set_property(TARGET nvcomp PROPERTY POSITION_INDEPENDENT_CODE True) - set_property(TARGET nvcomp PROPERTY CUDA_ARCHITECTURES ${GPU_ARCHS}) - target_compile_options(nvcomp PRIVATE - $<$:--expt-extended-lambda -Xcompiler -pthread>) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 4b6696e3911..95444db7dff 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace jni { @@ -741,11 +742,7 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { } #define CATCH_STD_CLASS(env, class_name, ret_val) \ - catch (const std::bad_alloc &e) { \ - /* In some cases a cuda exception can be the cause so peek and clear if needed*/ \ - if (cudaErrorMemoryAllocation == cudaPeekAtLastError()) { \ - cudaGetLastError(); \ - } \ + catch (const rmm::out_of_memory &e) { \ auto what = \ std::string("Could not allocate native memory: ") + (e.what() == nullptr ? 
"" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp index 4b378905a43..a18c88e10dc 100644 --- a/java/src/main/native/src/CompiledExpression.cpp +++ b/java/src/main/native/src/CompiledExpression.cpp @@ -144,6 +144,9 @@ cudf::ast::ast_operator jni_to_unary_operator(jbyte jni_op_value) { case 20: return cudf::ast::ast_operator::RINT; case 21: return cudf::ast::ast_operator::BIT_INVERT; case 22: return cudf::ast::ast_operator::NOT; + case 23: return cudf::ast::ast_operator::CAST_TO_INT64; + case 24: return cudf::ast::ast_operator::CAST_TO_UINT64; + case 25: return cudf::ast::ast_operator::CAST_TO_FLOAT64; default: throw std::invalid_argument("unexpected JNI AST unary operator value"); } } diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp index 0e34d2856f5..d551e9414d1 100644 --- a/java/src/main/native/src/NvcompJni.cpp +++ b/java/src/main/native/src/NvcompJni.cpp @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include -#include #include +#include +#include + +#include "check_nvcomp_output_sizes.hpp" #include "cudf_jni_apis.hpp" namespace { @@ -27,7 +28,7 @@ constexpr char const *NVCOMP_CUDA_ERROR_CLASS = "ai/rapids/cudf/nvcomp/NvcompCud constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; constexpr char const *UNSUPPORTED_CLASS = "java/lang/UnsupportedOperationException"; -void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { +void check_nvcomp_status(JNIEnv *env, nvcompStatus_t status) { switch (status) { case nvcompSuccess: break; case nvcompErrorInvalidValue: @@ -36,9 +37,15 @@ void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { case nvcompErrorNotSupported: cudf::jni::throw_java_exception(env, UNSUPPORTED_CLASS, "nvcomp unsupported"); break; + case nvcompErrorCannotDecompress: + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp cannot decompress"); + break; case nvcompErrorCudaError: cudf::jni::throw_java_exception(env, NVCOMP_CUDA_ERROR_CLASS, "nvcomp CUDA error"); break; + case nvcompErrorInternal: + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp internal error"); + break; default: cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp unknown error"); break; @@ -49,74 +56,16 @@ void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { extern "C" { -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetMetadata( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jlong jstream) { - try { - cudf::jni::auto_set_device(env); - void *metadata_ptr; - auto stream = reinterpret_cast(jstream); - auto status = nvcompDecompressGetMetadata(reinterpret_cast(in_ptr), in_size, - &metadata_ptr, stream); - check_nvcomp_status(env, status); - return reinterpret_cast(metadata_ptr); - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressDestroyMetadata( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - nvcompDecompressDestroyMetadata(reinterpret_cast(metadata_ptr)); - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetTempSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t temp_size; - auto status = 
nvcompDecompressGetTempSize(reinterpret_cast(metadata_ptr), &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetOutputSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t out_size; - auto status = nvcompDecompressGetOutputSize(reinterpret_cast(metadata_ptr), &out_size); - check_nvcomp_status(env, status); - return out_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressAsync( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jlong temp_ptr, jlong temp_size, - jlong metadata_ptr, jlong out_ptr, jlong out_size, jlong jstream) { - try { - cudf::jni::auto_set_device(env); - auto stream = reinterpret_cast(jstream); - auto status = nvcompDecompressAsync(reinterpret_cast(in_ptr), in_size, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(metadata_ptr), - reinterpret_cast(out_ptr), out_size, stream); - check_nvcomp_status(env, status); - } - CATCH_STD(env, ); -} - JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Data(JNIEnv *env, jclass, - jlong in_ptr, - jlong in_size) { + jlong j_in_ptr, + jlong j_in_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - return LZ4IsData(reinterpret_cast(in_ptr), in_size); + auto in_ptr = reinterpret_cast(j_in_ptr); + auto in_size = static_cast(j_in_size); + auto stream = reinterpret_cast(j_stream); + return LZ4IsData(in_ptr, in_size, stream); } CATCH_STD(env, 0) } @@ -130,365 +79,215 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Metadata(JN CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressGetTempSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t temp_size; - auto status = nvcompLZ4CompressGetTempSize(reinterpret_cast(in_ptr), in_size, comp_type, - &opts, &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressGetOutputSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size, - jlong temp_ptr, jlong temp_size, jboolean compute_exact) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t out_size; - auto status = nvcompLZ4CompressGetOutputSize( - reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, &out_size, compute_exact); - check_nvcomp_status(env, status); - return out_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4Compress( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size, - jlong temp_ptr, jlong temp_size, jlong out_ptr, jlong out_size, jlong jstream) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressConfigure( + JNIEnv *env, jclass, jlong j_chunk_size, jlong j_uncompressed_size) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - size_t compressed_size = out_size; - auto status 
= - nvcompLZ4CompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), &compressed_size, stream); + opts.chunk_size = static_cast(j_chunk_size); + auto uncompressed_size = static_cast(j_uncompressed_size); + std::size_t metadata_bytes = 0; + std::size_t temp_bytes = 0; + std::size_t out_bytes = 0; + auto status = nvcompLZ4CompressConfigure(&opts, NVCOMP_TYPE_CHAR, uncompressed_size, + &metadata_bytes, &temp_bytes, &out_bytes); check_nvcomp_status(env, status); - if (cudaStreamSynchronize(stream) != cudaSuccess) { - JNI_THROW_NEW(env, NVCOMP_CUDA_ERROR_CLASS, "Error synchronizing stream", 0); - } - return compressed_size; + cudf::jni::native_jlongArray result(env, 3); + result[0] = static_cast(metadata_bytes); + result[1] = static_cast(temp_bytes); + result[2] = static_cast(out_bytes); + return result.get_jArray(); } CATCH_STD(env, 0); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressAsync( - JNIEnv *env, jclass, jlong compressed_output_ptr, jlong in_ptr, jlong in_size, jint input_type, - jlong chunk_size, jlong temp_ptr, jlong temp_size, jlong out_ptr, jlong out_size, - jlong jstream) { + JNIEnv *env, jclass, jlong j_compressed_size_ptr, jlong j_in_ptr, jlong j_in_size, + jint j_input_type, jlong j_chunk_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, + jlong j_out_size, jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); + auto in_ptr = reinterpret_cast(j_in_ptr); + auto in_size = static_cast(j_in_size); + auto comp_type = static_cast(j_input_type); nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - auto compressed_size_ptr = reinterpret_cast(compressed_output_ptr); - *compressed_size_ptr = out_size; - auto status = - nvcompLZ4CompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), compressed_size_ptr, stream); + opts.chunk_size = static_cast(j_chunk_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto out_ptr = reinterpret_cast(j_out_ptr); + auto compressed_size_ptr = reinterpret_cast(j_compressed_size_ptr); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompLZ4CompressAsync(&opts, comp_type, in_ptr, in_size, temp_ptr, temp_size, + out_ptr, compressed_size_ptr, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetMetadata( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong jstream) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressConfigure( + JNIEnv *env, jclass, jlong j_input_ptr, jlong j_input_size, jlong j_stream) { try { cudf::jni::auto_set_device(env); - - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(sizes), - [](jlong x) -> size_t { return static_cast(x); }); - + auto compressed_ptr = reinterpret_cast(j_input_ptr); + auto compressed_bytes = static_cast(j_input_size); void *metadata_ptr = nullptr; - auto stream = reinterpret_cast(jstream); - 
auto status = nvcompBatchedLZ4DecompressGetMetadata(input_ptrs.data(), sizes.data(), - input_ptrs.size(), &metadata_ptr, stream); - check_nvcomp_status(env, status); - return reinterpret_cast(metadata_ptr); - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressDestroyMetadata( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - nvcompBatchedLZ4DecompressDestroyMetadata(reinterpret_cast(metadata_ptr)); - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t temp_size; + std::size_t metadata_bytes = 0; + std::size_t temp_bytes = 0; + std::size_t uncompressed_bytes = 0; + auto stream = reinterpret_cast(j_stream); auto status = - nvcompBatchedLZ4DecompressGetTempSize(reinterpret_cast(metadata_ptr), &temp_size); + nvcompLZ4DecompressConfigure(compressed_ptr, compressed_bytes, &metadata_ptr, + &metadata_bytes, &temp_bytes, &uncompressed_bytes, stream); check_nvcomp_status(env, status); - return static_cast(temp_size); + cudf::jni::native_jlongArray result(env, 4); + result[0] = reinterpret_cast(metadata_ptr); + result[1] = static_cast(metadata_bytes); + result[2] = static_cast(temp_bytes); + result[3] = static_cast(uncompressed_bytes); + return result.get_jArray(); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetOutputSize( - JNIEnv *env, jclass, jlong metadata_ptr, jint num_outputs) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressAsync( + JNIEnv *env, jclass, jlong j_in_ptr, jlong j_in_size, jlong j_metadata_ptr, + jlong j_metadata_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, jlong j_out_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - std::vector sizes(num_outputs); - auto status = nvcompBatchedLZ4DecompressGetOutputSize(reinterpret_cast(metadata_ptr), - num_outputs, sizes.data()); + auto compressed_ptr = reinterpret_cast(j_in_ptr); + auto compressed_bytes = static_cast(j_in_size); + auto metadata_ptr = reinterpret_cast(j_metadata_ptr); + auto metadata_bytes = static_cast(j_metadata_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_bytes = static_cast(j_temp_size); + auto uncompressed_ptr = reinterpret_cast(j_out_ptr); + auto uncompressed_bytes = static_cast(j_out_size); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompLZ4DecompressAsync(compressed_ptr, compressed_bytes, metadata_ptr, + metadata_bytes, temp_ptr, temp_bytes, uncompressed_ptr, + uncompressed_bytes, stream); check_nvcomp_status(env, status); - cudf::jni::native_jlongArray jsizes(env, num_outputs); - std::transform(sizes.begin(), sizes.end(), jsizes.data(), - [](size_t x) -> jlong { return static_cast(x); }); - return jsizes.get_jArray(); } - CATCH_STD(env, NULL); + CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong temp_ptr, jlong temp_size, - jlong metadata_ptr, jlongArray out_ptrs, jlongArray out_sizes, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DestroyMetadata(JNIEnv *env, jclass, + jlong metadata_ptr) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, 
in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - cudf::jni::native_jpointerArray output_ptrs(env, out_ptrs); - cudf::jni::native_jlongArray output_jsizes(env, out_sizes); - if (output_ptrs.size() != output_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "output array size mismatch"); - } - if (input_ptrs.size() != output_ptrs.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input/output array size mismatch"); - } - std::vector output_sizes; - std::transform(output_jsizes.data(), output_jsizes.data() + output_jsizes.size(), - std::back_inserter(output_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - auto stream = reinterpret_cast(jstream); - auto status = nvcompBatchedLZ4DecompressAsync( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), - reinterpret_cast(temp_ptr), static_cast(temp_size), - reinterpret_cast(metadata_ptr), output_ptrs.data(), output_sizes.data(), stream); - check_nvcomp_status(env, status); + nvcompLZ4DestroyMetadata(reinterpret_cast(metadata_ptr)); } CATCH_STD(env, ); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong chunk_size) { + JNIEnv *env, jclass, jlong j_batch_size, jlong j_max_chunk_size) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t temp_size = 0; - auto status = nvcompBatchedLZ4CompressGetTempSize(input_ptrs.data(), sizes.data(), - input_ptrs.size(), &opts, &temp_size); + auto batch_size = static_cast(j_batch_size); + auto max_chunk_size = static_cast(j_max_chunk_size); + std::size_t temp_size = 0; + auto status = nvcompBatchedLZ4CompressGetTempSize(batch_size, max_chunk_size, + nvcompBatchedLZ4DefaultOpts, &temp_size); check_nvcomp_status(env, status); return static_cast(temp_size); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetOutputSize( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong chunk_size, jlong temp_ptr, - jlong temp_size) { +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetMaxOutputChunkSize( + JNIEnv *env, jclass, jlong j_max_chunk_size) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - 
nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - std::vector output_sizes(input_ptrs.size()); - auto status = nvcompBatchedLZ4CompressGetOutputSize( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), &opts, - reinterpret_cast(temp_ptr), static_cast(temp_size), output_sizes.data()); + auto max_chunk_size = static_cast(j_max_chunk_size); + std::size_t max_output_size = 0; + auto status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( + max_chunk_size, nvcompBatchedLZ4DefaultOpts, &max_output_size); check_nvcomp_status(env, status); - cudf::jni::native_jlongArray jsizes(env, input_ptrs.size()); - std::transform(output_sizes.begin(), output_sizes.end(), jsizes.data(), - [](size_t x) -> jlong { return static_cast(x); }); - return jsizes.get_jArray(); + return static_cast(max_output_size); } - CATCH_STD(env, NULL); + CATCH_STD(env, 0); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync( - JNIEnv *env, jclass, jlong compressed_sizes_out_ptr, jlongArray in_ptrs, jlongArray in_sizes, - jlong chunk_size, jlong temp_ptr, jlong temp_size, jlongArray out_ptrs, jlongArray out_sizes, - jlong jstream) { + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_chunk_size, jlong j_batch_size, + jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_compressed_sizes_out_ptr, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - cudf::jni::native_jpointerArray output_ptrs(env, out_ptrs); - cudf::jni::native_jlongArray output_jsizes(env, out_sizes); - if (output_ptrs.size() != output_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "output array size mismatch"); - } - if (input_ptrs.size() != output_ptrs.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input/output array size mismatch"); - } - - auto output_sizes = reinterpret_cast(compressed_sizes_out_ptr); - std::transform(output_jsizes.data(), output_jsizes.data() + output_jsizes.size(), output_sizes, - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - auto status = nvcompBatchedLZ4CompressAsync( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), &opts, - reinterpret_cast(temp_ptr), static_cast(temp_size), output_ptrs.data(), - output_sizes, // input/output parameter - stream); + auto in_ptrs = reinterpret_cast(j_in_ptrs); + auto in_sizes = reinterpret_cast(j_in_sizes); + auto chunk_size = static_cast(j_chunk_size); + auto batch_size = static_cast(j_batch_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto out_ptrs = reinterpret_cast(j_out_ptrs); + auto compressed_out_sizes = reinterpret_cast(j_compressed_sizes_out_ptr); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompBatchedLZ4CompressAsync(in_ptrs, in_sizes, chunk_size, batch_size, temp_ptr, + temp_size, out_ptrs, compressed_out_sizes, + nvcompBatchedLZ4DefaultOpts, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); } 
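The batched LZ4 entry points above are intended to be driven in a fixed order: query the scratch size and worst-case per-chunk output size, launch the asynchronous compression, then (for the reverse path) recover the per-chunk uncompressed sizes before launching the asynchronous decompression. A minimal host-side sketch follows; it is not part of this patch, and batchSize, maxChunkSize, the dev* pointer/size vector addresses, tempPtr, compressedSizesOutPtr, and stream are placeholder values the caller is assumed to have prepared as the NvcompJni Javadoc describes:

    // Hypothetical driver for the new batched LZ4 JNI bindings (Java).
    // All dev* values are raw device addresses of pointer/size vectors.
    long tempSize = NvcompJni.batchedLZ4CompressGetTempSize(batchSize, maxChunkSize);
    long maxOutChunkSize = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(maxChunkSize);
    // ... allocate tempSize bytes of scratch and maxOutChunkSize bytes per output chunk ...
    NvcompJni.batchedLZ4CompressAsync(devInPtrs, devInSizes, maxChunkSize, batchSize,
        tempPtr, tempSize, devOutPtrs, compressedSizesOutPtr, stream);

    // Decompression mirrors this: recover the per-chunk output sizes first,
    // size the scratch space, then launch the batched decompress.
    NvcompJni.batchedLZ4GetDecompressSizeAsync(devCompPtrs, devCompSizes, devUncompSizes,
        batchSize, stream);
    long decompTempSize = NvcompJni.batchedLZ4DecompressGetTempSize(batchSize, maxChunkSize);
    NvcompJni.batchedLZ4DecompressAsync(devCompPtrs, devCompSizes, devUncompSizes, batchSize,
        decompTempPtr, decompTempSize, devUncompPtrs, stream);

Note that compressedSizesOutPtr must reference device memory with at least 8 bytes per chunk, and the native decompress path below verifies the actual decompressed sizes against the expected ones via check_nvcomp_output_sizes before returning.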
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressGetTempSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - size_t temp_size; - auto status = nvcompCascadedCompressGetTempSize(reinterpret_cast(in_ptr), in_size, - comp_type, &opts, &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressGetOutputSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jboolean compute_exact) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize( + JNIEnv *env, jclass, jlong j_batch_size, jlong j_chunk_size) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - size_t out_size; - auto status = nvcompCascadedCompressGetOutputSize( - reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, &out_size, compute_exact); + auto batch_size = static_cast(j_batch_size); + auto chunk_size = static_cast(j_chunk_size); + std::size_t temp_size = 0; + auto status = nvcompBatchedLZ4DecompressGetTempSize(batch_size, chunk_size, &temp_size); check_nvcomp_status(env, status); - return out_size; + return static_cast(temp_size); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompress( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jlong out_ptr, - jlong out_size, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync( + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size, + jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - auto stream = reinterpret_cast(jstream); - size_t compressed_size = out_size; - auto status = - nvcompCascadedCompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), &compressed_size, stream); + auto compressed_ptrs = reinterpret_cast(j_in_ptrs); + auto compressed_sizes = reinterpret_cast(j_in_sizes); + auto uncompressed_sizes = reinterpret_cast(j_out_sizes); + auto batch_size = static_cast(j_batch_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto uncompressed_ptrs = reinterpret_cast(j_out_ptrs); + auto stream = reinterpret_cast(j_stream); + auto uncompressed_statuses = rmm::device_uvector(batch_size, stream); + auto actual_uncompressed_sizes = rmm::device_uvector(batch_size, stream); + auto status = nvcompBatchedLZ4DecompressAsync( + compressed_ptrs, compressed_sizes, uncompressed_sizes, actual_uncompressed_sizes.data(), + batch_size, temp_ptr, temp_size, 
uncompressed_ptrs, uncompressed_statuses.data(), stream); check_nvcomp_status(env, status); - if (cudaStreamSynchronize(stream) != cudaSuccess) { - JNI_THROW_NEW(env, NVCOMP_CUDA_ERROR_CLASS, "Error synchronizing stream", 0); + if (!cudf::java::check_nvcomp_output_sizes(uncompressed_sizes, actual_uncompressed_sizes.data(), + batch_size, stream)) { + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, + "nvcomp decompress output size mismatch"); } - return compressed_size; } - CATCH_STD(env, 0); + CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressAsync( - JNIEnv *env, jclass, jlong compressed_output_ptr, jlong in_ptr, jlong in_size, jint input_type, - jint num_rles, jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jlong out_ptr, - jlong out_size, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync( + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - auto stream = reinterpret_cast(jstream); - auto compressed_size_ptr = reinterpret_cast(compressed_output_ptr); - *compressed_size_ptr = out_size; - auto status = - nvcompCascadedCompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), compressed_size_ptr, stream); + auto compressed_ptrs = reinterpret_cast(j_in_ptrs); + auto compressed_sizes = reinterpret_cast(j_in_sizes); + auto uncompressed_sizes = reinterpret_cast(j_out_sizes); + auto batch_size = static_cast(j_batch_size); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompBatchedLZ4GetDecompressSizeAsync(compressed_ptrs, compressed_sizes, + uncompressed_sizes, batch_size, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 1ee5345c036..4722db30244 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -210,17 +210,6 @@ class java_event_handler_memory_resource final : public device_memory_resource { } bool on_alloc_fail(std::size_t num_bytes) { - cudaError_t err = cudaPeekAtLastError(); - if (err != cudaSuccess) { - // workaround for RMM pooled mode (CNMEM backend) leaving a CUDA error pending - if (err == cudaErrorMemoryAllocation) { - cudaGetLastError(); - } else { - // let this allocation fail so the application can see the CUDA error - return false; - } - } - JNIEnv *env = cudf::jni::get_jni_env(jvm); jboolean result = env->CallBooleanMethod(handler_obj, on_alloc_fail_method, static_cast(num_bytes)); @@ -256,7 +245,7 @@ class java_event_handler_memory_resource final : public device_memory_resource { total_before = get_total_bytes_allocated(); result = resource->allocate(num_bytes, stream); break; - } catch (std::bad_alloc const &e) { + } catch (rmm::out_of_memory const &e) { if (!on_alloc_fail(num_bytes)) { throw; } diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.cu b/java/src/main/native/src/check_nvcomp_output_sizes.cu new file mode 100644 index 00000000000..944399882b8 --- /dev/null +++ b/java/src/main/native/src/check_nvcomp_output_sizes.cu @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include
+#include
+#include
+
+#include "check_nvcomp_output_sizes.hpp"
+
+namespace {
+
+struct java_domain {
+  static constexpr char const *name{"Java"};
+};
+
+} // anonymous namespace
+
+namespace cudf {
+namespace java {
+
+/**
+ * Check that the vector of expected uncompressed sizes matches the vector of actual
+ * uncompressed sizes. Both vectors are assumed to be in device memory and contain
+ * num_chunks elements.
+ */
+bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes,
+                               std::size_t const *dev_actual_uncompressed_sizes,
+                               std::size_t num_chunks, rmm::cuda_stream_view stream) {
+  NVTX3_FUNC_RANGE_IN(java_domain);
+  return thrust::equal(rmm::exec_policy(stream), dev_uncompressed_sizes,
+                       dev_uncompressed_sizes + num_chunks, dev_actual_uncompressed_sizes);
+}
+
+} // namespace java
+} // namespace cudf
diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.hpp b/java/src/main/native/src/check_nvcomp_output_sizes.hpp
new file mode 100644
index 00000000000..00b36471a85
--- /dev/null
+++ b/java/src/main/native/src/check_nvcomp_output_sizes.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace cudf {
+namespace java {
+
+/**
+ * Check that the vector of expected uncompressed sizes matches the vector of actual
+ * uncompressed sizes. Both vectors are assumed to be in device memory and contain
+ * num_chunks elements.
+ */ +bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes, + std::size_t const *dev_actual_uncompressed_sizes, + std::size_t num_chunks, rmm::cuda_stream_view stream); +} // namespace java +} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1af0d9a2f6..c767a98b342 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3085,6 +3085,17 @@ void testWindowStatic() { ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1, defaultOutput), options)) { assertColumnsAreEqual(expected, result); } + + try (ColumnVector expected = ColumnVector.fromBoxedDoubles(0.7071d, 1.5275d, 1.5275d, 1d, 1.4142); + ColumnVector result = v1.rollingWindow(RollingAggregation.standardDeviation(), options)) { + assertColumnsAreEqual(expected, result); + } + + try (ColumnVector expected = + ColumnVector.fromBoxedDoubles(Double.POSITIVE_INFINITY, 2.1602d, 2.1602d, 1.4142d, Double.POSITIVE_INFINITY); + ColumnVector result = v1.rollingWindow(RollingAggregation.standardDeviation(2), options)) { + assertColumnsAreEqual(expected, result); + } } } } diff --git a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java index a41cc22e9b2..c36d241500a 100644 --- a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java +++ b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,18 +29,20 @@ public class NvcompTest {
   private static final Logger log = LoggerFactory.getLogger(ColumnVector.class);

   @Test
-  void testLZ4RoundTripSync() {
+  void testLZ4RoundTripViaLZ4DecompressorSync() {
     lz4RoundTrip(false);
   }

   @Test
-  void testLZ4RoundTripAsync() {
+  void testLZ4RoundTripViaLZ4DecompressorAsync() {
     lz4RoundTrip(true);
   }

   @Test
   void testBatchedLZ4RoundTripAsync() {
+    final Cuda.Stream stream = Cuda.DEFAULT_STREAM;
     final long chunkSize = 64 * 1024;
+    final long targetIntermediateSize = Long.MAX_VALUE;
     final int maxElements = 1024 * 1024 + 1;
     final int numBuffers = 200;
     long[] data = new long[maxElements];
@@ -48,149 +50,52 @@ void testBatchedLZ4RoundTripAsync() {
       data[i] = i;
     }

-    DeviceMemoryBuffer[] originalBuffers = new DeviceMemoryBuffer[numBuffers];
-    DeviceMemoryBuffer[] uncompressedBuffers = new DeviceMemoryBuffer[numBuffers];
-
-    // compressed data in buffers that are likely oversized
-    DeviceMemoryBuffer[] compressedBuffers = new DeviceMemoryBuffer[numBuffers];
-
-    // compressed data in right-sized buffers
-    DeviceMemoryBuffer[] compressedInputs = new DeviceMemoryBuffer[numBuffers];
-
-    try {
+    try (CloseableArray originalBuffers =
+        CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) {
       // create the batched buffers to compress
-      for (int i = 0; i < numBuffers; ++i) {
-        originalBuffers[i] = initBatchBuffer(data, i);
+      for (int i = 0; i < originalBuffers.size(); i++) {
+        originalBuffers.set(i, initBatchBuffer(data, i));
+        // Increment the refcount since compression will try to close it
+        originalBuffers.get(i).incRefCount();
       }

-      // compress the buffers
-      long[] outputSizes;
-      long[] compressedSizes;
-      long tempSize = BatchedLZ4Compressor.getTempSize(originalBuffers, chunkSize);
-      try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) {
-        outputSizes = BatchedLZ4Compressor.getOutputSizes(originalBuffers, chunkSize, tempBuffer);
-        for (int i = 0; i < numBuffers; ++i) {
-          compressedBuffers[i] = DeviceMemoryBuffer.allocate(outputSizes[i]);
-        }
-        long sizesBufferSize = BatchedLZ4Compressor.getCompressedSizesBufferSize(numBuffers);
-        try (HostMemoryBuffer compressedSizesBuffer = HostMemoryBuffer.allocate(sizesBufferSize)) {
-          BatchedLZ4Compressor.compressAsync(compressedSizesBuffer, originalBuffers, chunkSize,
-              tempBuffer, compressedBuffers, Cuda.DEFAULT_STREAM);
-          Cuda.DEFAULT_STREAM.sync();
-          compressedSizes = new long[numBuffers];
-          for (int i = 0; i < numBuffers; ++i) {
-            compressedSizes[i] = compressedSizesBuffer.getLong(i * 8);
-          }
-        }
-      }
-
-      // right-size the compressed buffers based on reported compressed sizes
-      for (int i = 0; i < numBuffers; ++i) {
-        compressedInputs[i] = compressedBuffers[i].slice(0, compressedSizes[i]);
-      }
-
-      // decompress the buffers
-      try (BatchedLZ4Decompressor.BatchedMetadata metadata =
-          BatchedLZ4Decompressor.getMetadata(compressedInputs, Cuda.DEFAULT_STREAM)) {
-        outputSizes = BatchedLZ4Decompressor.getOutputSizes(metadata, numBuffers);
-        for (int i = 0; i < numBuffers; ++i) {
-          uncompressedBuffers[i] = DeviceMemoryBuffer.allocate(outputSizes[i]);
-        }
-        tempSize = BatchedLZ4Decompressor.getTempSize(metadata);
-        try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) {
-          BatchedLZ4Decompressor.decompressAsync(compressedInputs, tempBuffer, metadata,
-              uncompressedBuffers, Cuda.DEFAULT_STREAM);
-        }
-      }
+      // compress and decompress the buffers
+      BatchedLZ4Compressor compressor = new BatchedLZ4Compressor(chunkSize, targetIntermediateSize);

-      // check the decompressed results against the
original - for (int i = 0; i < numBuffers; ++i) { - try (HostMemoryBuffer expected = HostMemoryBuffer.allocate(originalBuffers[i].getLength()); - HostMemoryBuffer actual = HostMemoryBuffer.allocate(outputSizes[i])) { - Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); - Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); - Assertions.assertEquals(originalBuffers[i].getLength(), uncompressedBuffers[i].getLength(), - "uncompressed size mismatch at buffer " + i); - expected.copyFromDeviceBuffer(originalBuffers[i]); - actual.copyFromDeviceBuffer(uncompressedBuffers[i]); - byte[] expectedBytes = new byte[(int) expected.getLength()]; - expected.getBytes(expectedBytes, 0, 0, expected.getLength()); - byte[] actualBytes = new byte[(int) actual.getLength()]; - actual.getBytes(actualBytes, 0, 0, actual.getLength()); - Assertions.assertArrayEquals(expectedBytes, actualBytes, - "mismatch in batch buffer " + i); + try (CloseableArray<DeviceMemoryBuffer> compressedBuffers = + CloseableArray.wrap(compressor.compress(originalBuffers.getArray(), stream)); + CloseableArray<DeviceMemoryBuffer> uncompressedBuffers = + CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) { + for (int i = 0; i < numBuffers; i++) { + uncompressedBuffers.set(i, + DeviceMemoryBuffer.allocate(originalBuffers.get(i).getLength())); } - } - } finally { - closeBufferArray(originalBuffers); - closeBufferArray(uncompressedBuffers); - closeBufferArray(compressedBuffers); - closeBufferArray(compressedInputs); - } - } - - @Test - void testBatchedLZ4CompressRoundTrip() { - final long chunkSize = 64 * 1024; - final int maxElements = 1024 * 1024 + 1; - final int numBuffers = 200; - long[] data = new long[maxElements]; - for (int i = 0; i < maxElements; ++i) { - data[i] = i; - } - - DeviceMemoryBuffer[] originalBuffers = new DeviceMemoryBuffer[numBuffers]; - DeviceMemoryBuffer[] uncompressedBuffers = new DeviceMemoryBuffer[numBuffers]; - BatchedLZ4Compressor.BatchedCompressionResult compResult = null; - - // compressed data in right-sized buffers - DeviceMemoryBuffer[] compressedInputs = new DeviceMemoryBuffer[numBuffers]; - - try { - // create the batched buffers to compress - for (int i = 0; i < numBuffers; ++i) { - originalBuffers[i] = initBatchBuffer(data, i); - } - // compress the buffers - compResult = BatchedLZ4Compressor.compress(originalBuffers, chunkSize, Cuda.DEFAULT_STREAM); + // decompress takes ownership of the compressed buffers and will close them + BatchedLZ4Decompressor.decompressAsync(chunkSize, compressedBuffers.release(), + uncompressedBuffers.getArray(), stream); - // right-size the compressed buffers based on reported compressed sizes - DeviceMemoryBuffer[] compressedBuffers = compResult.getCompressedBuffers(); - long[] compressedSizes = compResult.getCompressedSizes(); - for (int i = 0; i < numBuffers; ++i) { - compressedInputs[i] = compressedBuffers[i].slice(0, compressedSizes[i]); - } - - // decompress the buffers - uncompressedBuffers = BatchedLZ4Decompressor.decompressAsync(compressedInputs, - Cuda.DEFAULT_STREAM); - - // check the decompressed results against the original - for (int i = 0; i < numBuffers; ++i) { - try (HostMemoryBuffer expected = HostMemoryBuffer.allocate(originalBuffers[i].getLength()); - HostMemoryBuffer actual = HostMemoryBuffer.allocate(uncompressedBuffers[i].getLength())) { - Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); - Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); - Assertions.assertEquals(originalBuffers[i].getLength(), uncompressedBuffers[i].getLength(), -
"uncompressed size mismatch at buffer " + i); - expected.copyFromDeviceBuffer(originalBuffers[i]); - actual.copyFromDeviceBuffer(uncompressedBuffers[i]); - byte[] expectedBytes = new byte[(int) expected.getLength()]; - expected.getBytes(expectedBytes, 0, 0, expected.getLength()); - byte[] actualBytes = new byte[(int) actual.getLength()]; - actual.getBytes(actualBytes, 0, 0, actual.getLength()); - Assertions.assertArrayEquals(expectedBytes, actualBytes, - "mismatch in batch buffer " + i); + // check the decompressed results against the original + for (int i = 0; i < numBuffers; ++i) { + try (HostMemoryBuffer expected = + HostMemoryBuffer.allocate(originalBuffers.get(i).getLength()); + HostMemoryBuffer actual = + HostMemoryBuffer.allocate(uncompressedBuffers.get(i).getLength())) { + Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); + Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); + Assertions.assertEquals(expected.getLength(), actual.getLength(), + "uncompressed size mismatch at buffer " + i); + expected.copyFromDeviceBuffer(originalBuffers.get(i)); + actual.copyFromDeviceBuffer(uncompressedBuffers.get(i)); + byte[] expectedBytes = new byte[(int) expected.getLength()]; + expected.getBytes(expectedBytes, 0, 0, expected.getLength()); + byte[] actualBytes = new byte[(int) actual.getLength()]; + actual.getBytes(actualBytes, 0, 0, actual.getLength()); + Assertions.assertArrayEquals(expectedBytes, actualBytes, + "mismatch in batch buffer " + i); + } } } - } finally { - closeBufferArray(originalBuffers); - closeBufferArray(uncompressedBuffers); - closeBufferArray(compressedInputs); - if (compResult != null) { - closeBufferArray(compResult.getCompressedBuffers()); - } } } @@ -200,14 +105,6 @@ private void closeBuffer(MemoryBuffer buffer) { } } - private void closeBufferArray(MemoryBuffer[] buffers) { - for (MemoryBuffer buffer : buffers) { - if (buffer != null) { - buffer.close(); - } - } - } - private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { // grab a subsection of the data based on buffer ID int dataStart = 0; @@ -239,6 +136,7 @@ private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { } private void lz4RoundTrip(boolean useAsync) { + final Cuda.Stream stream = Cuda.DEFAULT_STREAM; final long chunkSize = 64 * 1024; final int numElements = 10 * 1024 * 1024 + 1; long[] data = new long[numElements]; @@ -251,31 +149,32 @@ private void lz4RoundTrip(boolean useAsync) { DeviceMemoryBuffer uncompressedBuffer = null; try (ColumnVector v = ColumnVector.fromLongs(data)) { BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA); - log.debug("Uncompressed size is {}", inputBuffer.getLength()); - - long tempSize = LZ4Compressor.getTempSize(inputBuffer, CompressionType.CHAR, chunkSize); - - log.debug("Using {} temporary space for lz4 compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); + final long uncompressedSize = inputBuffer.getLength(); + log.debug("Uncompressed size is {}", uncompressedSize); - long outSize = LZ4Compressor.getOutputSize(inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer); - log.debug("lz4 compressed size estimate is {}", outSize); + LZ4Compressor.Configuration compressConf = + LZ4Compressor.configure(chunkSize, uncompressedSize); + Assertions.assertTrue(compressConf.getMetadataBytes() > 0); + log.debug("Using {} temporary space for lz4 compression", compressConf.getTempBytes()); + tempBuffer = DeviceMemoryBuffer.allocate(compressConf.getTempBytes()); + log.debug("lz4 
compressed size estimate is {}", compressConf.getMaxCompressedBytes()); - compressedBuffer = DeviceMemoryBuffer.allocate(outSize); + compressedBuffer = DeviceMemoryBuffer.allocate(compressConf.getMaxCompressedBytes()); long startTime = System.nanoTime(); long compressedSize; if (useAsync) { - try (HostMemoryBuffer tempHostBuffer = HostMemoryBuffer.allocate(8)) { - LZ4Compressor.compressAsync(tempHostBuffer, inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer, compressedBuffer, Cuda.DEFAULT_STREAM); - Cuda.DEFAULT_STREAM.sync(); - compressedSize = tempHostBuffer.getLong(0); + try (DeviceMemoryBuffer devCompressedSizeBuffer = DeviceMemoryBuffer.allocate(8); + HostMemoryBuffer hostCompressedSizeBuffer = HostMemoryBuffer.allocate(8)) { + LZ4Compressor.compressAsync(devCompressedSizeBuffer, inputBuffer, CompressionType.CHAR, + chunkSize, tempBuffer, compressedBuffer, stream); + hostCompressedSizeBuffer.copyFromDeviceBufferAsync(devCompressedSizeBuffer, stream); + stream.sync(); + compressedSize = hostCompressedSizeBuffer.getLong(0); } } else { compressedSize = LZ4Compressor.compress(inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer, compressedBuffer, Cuda.DEFAULT_STREAM); + tempBuffer, compressedBuffer, stream); } double duration = (System.nanoTime() - startTime) / 1000.0; log.info("Compressed with lz4 to {} in {} us", compressedSize, duration); @@ -283,23 +182,20 @@ private void lz4RoundTrip(boolean useAsync) { tempBuffer.close(); tempBuffer = null; - Assertions.assertTrue(Decompressor.isLZ4Data(compressedBuffer)); - - try (Decompressor.Metadata metadata = - Decompressor.getMetadata(compressedBuffer, Cuda.DEFAULT_STREAM)) { - Assertions.assertTrue(metadata.isLZ4Metadata()); - tempSize = Decompressor.getTempSize(metadata); + try (LZ4Decompressor.Configuration decompressConf = + LZ4Decompressor.configure(compressedBuffer, stream)) { + final long tempSize = decompressConf.getTempBytes(); log.debug("Using {} temporary space for lz4 compression", tempSize); tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - outSize = Decompressor.getOutputSize(metadata); + final long outSize = decompressConf.getUncompressedBytes(); Assertions.assertEquals(inputBuffer.getLength(), outSize); uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize); - Decompressor.decompressAsync(compressedBuffer, tempBuffer, metadata, uncompressedBuffer, - Cuda.DEFAULT_STREAM); + LZ4Decompressor.decompressAsync(compressedBuffer, decompressConf, tempBuffer, + uncompressedBuffer, stream); try (ColumnVector v2 = new ColumnVector( DType.INT64, @@ -324,133 +220,4 @@ private void lz4RoundTrip(boolean useAsync) { closeBuffer(uncompressedBuffer); } } - - @Test - void testCascadedRoundTripSync() { - cascadedRoundTrip(false); - } - - @Test - void testCascadedRoundTripAsync() { - cascadedRoundTrip(true); - } - - private void cascadedRoundTrip(boolean useAsync) { - final int numElements = 10 * 1024 * 1024 + 1; - final int numRunLengthEncodings = 2; - final int numDeltas = 1; - final boolean useBitPacking = true; - int[] data = new int[numElements]; - for (int i = 0; i < numElements; ++i) { - data[i] = i; - } - - DeviceMemoryBuffer tempBuffer = null; - DeviceMemoryBuffer compressedBuffer = null; - DeviceMemoryBuffer uncompressedBuffer = null; - try (ColumnVector v = ColumnVector.fromInts(data)) { - BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA); - log.debug("Uncompressed size is " + inputBuffer.getLength()); - - long tempSize = NvcompJni.cascadedCompressGetTempSize( - inputBuffer.getAddress(), - 
inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking); - - log.debug("Using {} temporary space for cascaded compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - - long outSize = NvcompJni.cascadedCompressGetOutputSize( - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - false); - log.debug("Inexact cascaded compressed size estimate is {}", outSize); - - compressedBuffer = DeviceMemoryBuffer.allocate(outSize); - - long startTime = System.nanoTime(); - long compressedSize; - if (useAsync) { - try (HostMemoryBuffer tempHostBuffer = HostMemoryBuffer.allocate(8)) { - NvcompJni.cascadedCompressAsync( - tempHostBuffer.getAddress(), - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - compressedBuffer.getAddress(), - compressedBuffer.getLength(), - 0); - Cuda.DEFAULT_STREAM.sync(); - compressedSize = tempHostBuffer.getLong(0); - } - } else { - compressedSize = NvcompJni.cascadedCompress( - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - compressedBuffer.getAddress(), - compressedBuffer.getLength(), - 0); - } - - double duration = (System.nanoTime() - startTime) / 1000.0; - log.debug("Compressed with cascaded to {} in {} us", compressedSize, duration); - - tempBuffer.close(); - tempBuffer = null; - - try (Decompressor.Metadata metadata = - Decompressor.getMetadata(compressedBuffer, Cuda.DEFAULT_STREAM)) { - tempSize = Decompressor.getTempSize(metadata); - - log.debug("Using {} temporary space for cascaded compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - - outSize = Decompressor.getOutputSize(metadata); - Assertions.assertEquals(inputBuffer.getLength(), outSize); - - uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize); - - Decompressor.decompressAsync(compressedBuffer, tempBuffer, metadata, uncompressedBuffer, - Cuda.DEFAULT_STREAM); - - try (ColumnVector v2 = new ColumnVector( - DType.INT32, - numElements, - Optional.empty(), - uncompressedBuffer, - null, - null)) { - uncompressedBuffer = null; - try (ColumnVector compare = v2.equalTo(v); - Scalar compareAll = compare.all()) { - Assertions.assertTrue(compareAll.getBoolean()); - } - } - } - } finally { - closeBuffer(tempBuffer); - closeBuffer(compressedBuffer); - closeBuffer(uncompressedBuffer); - } - } } diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 0e68f1c71cc..ff5870c50be 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -116,6 +116,8 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): ) elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION + elif dtype == "decimal32": + meta["max_precision"] = cudf.Decimal32Dtype.MAX_PRECISION meta["dtype"] = dtype meta["null_frequency"] = null_frequency diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd index 74efdb08bea..628ffef433b 100644 --- a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd +++ 
b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd @@ -5,6 +5,8 @@ from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: # cython type stub to help resolve to numeric::decimal64 ctypedef int64_t decimal64 + # cython type stub to help resolve to numeric::decimal32 + ctypedef int32_t decimal32 cdef cppclass scale_type: scale_type(int32_t) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 43bc1ac9db7..5b73b1fef10 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -192,7 +192,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column if meta is not None: for col, col_meta in zip(column_names, meta["columns"]): - if isinstance(df._data[col].dtype, cudf.Decimal64Dtype): + if is_decimal_dtype(df._data[col].dtype): df._data[col].dtype.precision = ( col_meta["metadata"]["precision"] ) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index ab53a242db2..21a039dbf78 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -2,7 +2,6 @@ import cudf from cudf.api.types import is_decimal_dtype -from cudf.core.dtypes import Decimal64Dtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column @@ -18,7 +17,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair from cudf._lib.aggregation cimport Aggregation, make_aggregation -from cudf._lib.types cimport dtype_to_data_type, underlying_type_t_type_id +from cudf._lib.types cimport ( + dtype_to_data_type, + is_decimal_type_id, + underlying_type_t_type_id, +) import numpy as np @@ -72,11 +75,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): c_out_dtype )) - if c_result.get()[0].type().id() == libcudf_types.type_id.DECIMAL64: + if is_decimal_type_id(c_result.get()[0].type().id()): scale = -c_result.get()[0].type().scale() precision = _reduce_precision(col_dtype, reduction_op, len(incol)) py_result = DeviceScalar.from_unique_ptr( - move(c_result), dtype=cudf.Decimal64Dtype(precision, scale) + move(c_result), dtype=col_dtype.__class__(precision, scale) ) else: py_result = DeviceScalar.from_unique_ptr(move(c_result)) @@ -157,4 +160,4 @@ def _reduce_precision(dtype, op, nrows): new_p = 2 * p + nrows else: raise NotImplementedError() - return max(min(new_p, cudf.Decimal64Dtype.MAX_PRECISION), 0) + return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2d7f56dc5ce..e4311b356ec 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -204,12 +204,8 @@ def clip(Column input_col, object lo, object hi): and > hi will be replaced by hi """ - lo_scalar = as_device_scalar( - lo, dtype=input_col.dtype if lo is None else None - ) - hi_scalar = as_device_scalar( - hi, dtype=input_col.dtype if hi is None else None - ) + lo_scalar = as_device_scalar(lo, dtype=input_col.dtype) + hi_scalar = as_device_scalar(hi, dtype=input_col.dtype) return clamp(input_col, lo_scalar, hi_scalar) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 4017c60683e..43c0198f80a 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -45,7 +45,7 @@ from cudf._lib.cpp.scalar.scalar cimport ( struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport decimal64,
scale_type +from cudf._lib.cpp.wrappers.decimals cimport decimal32, decimal64, scale_type from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -88,8 +88,8 @@ cdef class DeviceScalar: # IMPORTANT: this should only ever be called from __init__ valid = not _is_null_host_scalar(value) - if isinstance(dtype, cudf.Decimal64Dtype): - _set_decimal64_from_scalar( + if isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): + _set_decimal_from_scalar( self.c_value, value, dtype, valid) elif isinstance(dtype, cudf.ListDtype): _set_list_from_pylist( @@ -118,7 +118,7 @@ cdef class DeviceScalar: ) def _to_host_scalar(self): - if isinstance(self.dtype, cudf.Decimal64Dtype): + if isinstance(self.dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): result = _get_py_decimal_from_fixed_point(self.c_value) elif cudf.api.types.is_struct_dtype(self.dtype): result = _get_py_dict_from_struct(self.c_value) @@ -305,16 +305,25 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): +cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid + if isinstance(dtype, cudf.Decimal64Dtype): + s.reset( + new fixed_point_scalar[decimal64]( + np.int64(value), scale_type(-dtype.scale), valid + ) ) - ) + elif isinstance(dtype, cudf.Decimal32Dtype): + s.reset( + new fixed_point_scalar[decimal32]( + np.int32(value), scale_type(-dtype.scale), valid + ) + ) + else: + raise ValueError(f"dtype not supported: {dtype}") cdef _set_struct_from_pydict(unique_ptr[scalar]& s, object value, @@ -450,6 +459,10 @@ cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): rep_val = int((<fixed_point_scalar[decimal64]*> s_ptr)[0].value()) scale = int((<fixed_point_scalar[decimal64]*> s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) + elif cdtype.id() == libcudf_types.DECIMAL32: + rep_val = int((<fixed_point_scalar[decimal32]*> s_ptr)[0].value()) + scale = int((<fixed_point_scalar[decimal32]*> s_ptr)[0].type().scale()) + return decimal.Decimal(rep_val).scaleb(scale) else: raise ValueError("Could not convert cudf::scalar to numpy scalar") diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index dbbe9b1e05a..58e3221a4ec 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -17,3 +17,4 @@ ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) cdef libcudf_types.data_type dtype_to_data_type(dtype) except * +cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index e798d78d426..1fa389f408c 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -225,10 +225,13 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: ( <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)])) - if tid in ( - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL32 - ): + if is_decimal_type_id(tid): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) + +cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *: + return tid in ( + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL32 + ) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index
85b4bf20e5c..b6089b65aa5 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -129,13 +129,11 @@ def null_raw(self): return self._series._column.mask_array_view def make_series(self): - """Make a Series object out of this node - """ + """Make a Series object out of this node""" return self._series.copy(deep=False) def _make_dictionary_series(self): - """Make a dictionary-encoded series from this node - """ + """Make a dictionary-encoded series from this node""" assert self.is_dictionary return self._series.copy(deep=False) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 23b6c01ca83..b5042305299 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -400,8 +400,7 @@ def valid_count(self) -> int: @property def nullmask(self) -> Buffer: - """The gpu buffer for the null-mask - """ + """The gpu buffer for the null-mask""" if not self.nullable: raise ValueError("Column has no null mask") return self.mask_array_view @@ -630,8 +629,7 @@ def fillna( ) def isnull(self) -> ColumnBase: - """Identify missing values in a Column. - """ + """Identify missing values in a Column.""" result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -642,8 +640,7 @@ def isnull(self) -> ColumnBase: return result def notnull(self) -> ColumnBase: - """Identify non-missing values in a Column. - """ + """Identify non-missing values in a Column.""" result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -697,8 +694,7 @@ def take( keep_index: bool = True, nullify: bool = False, ) -> T: - """Return Column by taking values from the corresponding *indices*. - """ + """Return Column by taking values from the corresponding *indices*.""" # Handle zero size if indices.size == 0: return cast(T, column_empty_like(self, newsize=0)) @@ -1254,8 +1250,7 @@ def column_empty_like( masked: bool = False, newsize: int = None, ) -> ColumnBase: - """Allocate a new column like the given *column* - """ + """Allocate a new column like the given *column*""" if dtype is None: dtype = column.dtype row_count = len(column) if newsize is None else newsize @@ -1297,8 +1292,7 @@ def column_empty_like_same_mask( def column_empty( row_count: int, dtype: Dtype = "object", masked: bool = False ) -> ColumnBase: - """Allocate a new column like the given row_count and dtype. - """ + """Allocate a new column like the given row_count and dtype.""" dtype = cudf.dtype(dtype) children = () # type: Tuple[ColumnBase, ...] 
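The scalar.pyx changes above reduce the decimal32 and decimal64 paths to one pattern: a scalar is stored as an integer rep plus a scale, and recovered as a Python decimal with Decimal.scaleb(). A minimal plain-Python sketch of that round-trip (the helper names are illustrative, not cudf API; note libcudf stores the negative of the dtype's scale):

import decimal

def to_rep(value: decimal.Decimal, dtype_scale: int) -> int:
    # Shift the decimal point right by `dtype_scale` digits; the remaining
    # integer is what a fixed_point_scalar stores (int32-wide for decimal32,
    # int64-wide for decimal64 -- the only difference between the two paths).
    return int(value.scaleb(dtype_scale).to_integral_value())

def from_rep(rep: int, dtype_scale: int) -> decimal.Decimal:
    # Mirror of _get_py_decimal_from_fixed_point: Decimal(rep).scaleb(scale),
    # where the libcudf scale is -dtype_scale.
    return decimal.Decimal(rep).scaleb(-dtype_scale)

v = decimal.Decimal("1.23")
rep = to_rep(v, 2)            # 123
assert from_rep(rep, 2) == v  # Decimal('1.23')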
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b1d69316863..68379002e6b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -347,8 +347,7 @@ def as_string_column( ) def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index b13ad8664dc..6409a9f9196 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -25,8 +25,7 @@ class DecimalBaseColumn(NumericalBaseColumn): - """Base column for decimal64 and decimal32 columns - """ + """Base column for decimal64 and decimal32 columns""" dtype: Union[Decimal32Dtype, Decimal64Dtype] @@ -321,5 +320,5 @@ def _binop_precision(l_dtype, r_dtype, op): result = p1 + p2 + 1 else: raise NotImplementedError() - + # TODO return min(result, cudf.Decimal64Dtype.MAX_PRECISION) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index ca5026c2293..becb303feeb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -21,7 +21,7 @@ column, string, ) -from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype +from cudf.core.dtypes import CategoricalDtype, Decimal32Dtype, Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( NUMERIC_TYPES, @@ -147,6 +147,7 @@ def binary_operator( NumericalColumn, cudf.Scalar, cudf.core.column.Decimal64Column, + cudf.core.column.Decimal32Column, ), ) or np.isscalar(rhs) @@ -158,6 +159,11 @@ def binary_operator( Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) ) return lhs.binary_operator(binop, rhs) + elif isinstance(rhs, cudf.core.column.Decimal32Column): + lhs = self.as_decimal_column( + Decimal32Dtype(Decimal32Dtype.MAX_PRECISION, 0) + ) + return lhs.binary_operator(binop, rhs) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: tmp = self if reflect else rhs @@ -291,8 +297,7 @@ def _process_values_for_isin( return lhs, rhs def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" dkind = self.dtype.kind if dkind == "f": return self.dtype.type(np.nan) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index c26b8b7e09c..853fb360c50 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -194,8 +194,7 @@ def corr(self, other: ColumnBase) -> float: def round( self, decimals: int = 0, how: str = "half_even" ) -> NumericalBaseColumn: - """Round the values in the Column to the given number of decimals. 
- """ + """Round the values in the Column to the given number of decimals.""" return libcudf.round.round(self, decimal_places=decimals, how=how) def _apply_scan_op(self, op: str) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index dba96b9069d..82f1794b200 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -42,7 +42,7 @@ def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column """ + """Takes in string column and returns boolean column""" return ( libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") ).fillna(False) @@ -552,7 +552,7 @@ def _split_by_character(self): def extract( self, pat: str, flags: int = 0, expand: bool = True ) -> SeriesOrIndex: - """ + r""" Extract capture groups in the regex `pat` as columns in a DataFrame. For each subject string in the Series, extract groups from the first @@ -624,7 +624,7 @@ def contains( na=np.nan, regex: bool = True, ) -> SeriesOrIndex: - """ + r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -3270,7 +3270,7 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: return self._return_or_inplace(libstrings.wrap(self._column, width)) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - """ + r""" Count occurrences of pattern in each string of the Series/Index. This function is used to count the number of times a particular @@ -4767,7 +4767,7 @@ def is_consonant(self, position) -> SeriesOrIndex: 0 True 1 False dtype: bool - """ + """ ltype = libstrings.LetterType.CONSONANT if can_convert_to_column(position): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index c7b13903751..4b7a3bcc197 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -305,8 +305,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ) def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" return np.timedelta64("nat", self.time_unit) @property diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 56882f89af8..2411b2a9211 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -244,9 +244,7 @@ def _clear_cache(self): del self._column_length def to_pandas_index(self) -> pd.Index: - """" - Convert the keys of the ColumnAccessor to a Pandas Index object. - """ + """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: # Using `from_frame()` instead of `from_tuples` # prevents coercion of values to a different type diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d2dbac4d155..5239cf9d648 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -923,8 +923,7 @@ def dtypes(self): @property def ndim(self): - """Dimension of the data. DataFrame ndim is always 2. - """ + """Dimension of the data. 
DataFrame ndim is always 2.""" return 2 def __dir__(self): @@ -1023,8 +1022,7 @@ def __getitem__(self, arg): @annotate("DATAFRAME_SETITEM", color="blue", domain="cudf_python") def __setitem__(self, arg, value): - """Add/set column by *arg or DataFrame* - """ + """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): # not handling set_item where arg = df & value = df if isinstance(value, DataFrame): @@ -1143,13 +1141,13 @@ def __sizeof__(self): def _slice(self: T, arg: slice) -> T: """ - _slice : slice the frame as per the arg + _slice : slice the frame as per the arg - Parameters - ---------- - arg : should always be of type slice + Parameters + ---------- + arg : should always be of type slice - """ + """ from cudf.core.index import RangeIndex num_rows = len(self) @@ -1306,7 +1304,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented def _get_numeric_data(self): - """ Return a dataframe with only numeric data types """ + """Return a dataframe with only numeric data types""" columns = [ c for c, dt in self.dtypes.items() @@ -2057,7 +2055,7 @@ def __iter__(self): return iter(self.columns) def iteritems(self): - """ Iterate over column names and series pairs """ + """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) @@ -2078,8 +2076,7 @@ def at(self): @property # type: ignore @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): - """Returns a tuple of columns - """ + """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore @@ -2128,8 +2125,7 @@ def _rename_columns(self, new_names): @property def index(self): - """Returns the index of the DataFrame - """ + """Returns the index of the DataFrame""" return self._index @index.setter @@ -2618,7 +2614,7 @@ def take(self, positions, keep_index=True): @annotate("INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value): - """ Add a column to DataFrame at the index specified by loc. + """Add a column to DataFrame at the index specified by loc. Parameters ---------- @@ -2842,8 +2838,7 @@ def drop( return out def _drop_column(self, name): - """Drop a column by *name* - """ + """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] @@ -2926,8 +2921,7 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) def pop(self, item): - """Return a column and drop it from the DataFrame. - """ + """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped @@ -5612,8 +5606,7 @@ def isin(self, values): # Stats # def _prepare_for_rowwise_op(self, method, skipna): - """Prepare a DataFrame for CuPy-based row-wise operations. - """ + """Prepare a DataFrame for CuPy-based row-wise operations.""" if method not in _cupy_nan_methods_map and any( col.nullable for col in self._columns @@ -6239,8 +6232,7 @@ def cov(self, **kwargs): return df def corr(self): - """Compute the correlation matrix of a DataFrame. 
- """ + """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns) df.columns = self.columns @@ -6766,11 +6758,11 @@ def _setitem_with_dataframe( mask: Optional[cudf.core.column.ColumnBase] = None, ): """ - This function sets item dataframes relevant columns with replacement df - :param input_df: Dataframe to be modified inplace - :param replace_df: Replacement DataFrame to replace values with - :param input_cols: columns to replace in the input dataframe - :param mask: boolean mask in case of masked replacing + This function sets item dataframes relevant columns with replacement df + :param input_df: Dataframe to be modified inplace + :param replace_df: Replacement DataFrame to replace values with + :param input_cols: columns to replace in the input dataframe + :param mask: boolean mask in case of masked replacing """ if input_cols is None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 71f910d7bb9..0b895460410 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1765,8 +1765,7 @@ def _fill(self, fill_values, begin, end, inplace): return self def shift(self, periods=1, freq=None, axis=0, fill_value=None): - """Shift values by `periods` positions. - """ + """Shift values by `periods` positions.""" assert axis in (None, 0) and freq is None return self._shift(periods) @@ -5505,7 +5504,7 @@ def multiply(self, other, axis, level=None, fill_value=None): def rmul(self, other, axis, level=None, fill_value=None): """ - Get Multiplication of dataframe or series and other, element-wise + Get Multiplication of dataframe or series and other, element-wise (binary operator `rmul`). Equivalent to ``other * frame``, but with support to substitute a diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ffe0051297b..6ffba8da069 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -820,7 +820,7 @@ def quantile(self, q=0.5, interpolation="linear"): interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". 
- """ + """ def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) @@ -893,8 +893,7 @@ def diff(self, periods=1, axis=0): return result._copy_type_metadata(value_columns) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: - """Internal implementation for `ffill` and `bfill` - """ + """Internal implementation for `ffill` and `bfill`""" value_columns = self.grouping.values result = self.obj.__class__._from_data( self._groupby.replace_nulls( @@ -1365,8 +1364,7 @@ def _handle_by_or_level(self, by=None, level=None): @property def keys(self): - """Return grouping key columns as index - """ + """Return grouping key columns as index""" nkeys = len(self._key_columns) if nkeys == 0: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f858a589614..27edd41ed92 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -656,8 +656,7 @@ def _compute_levels_and_codes(self): self._codes = cudf.DataFrame._from_data(codes) def _compute_validity_mask(self, index, row_tuple, max_length): - """ Computes the valid set of indices of values in the lookup - """ + """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() for name, row in zip(index.names, row_tuple): if isinstance(row, slice) and row == slice(None): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 91fef1df876..b2fac7a6140 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -592,7 +592,7 @@ def get_dummies( drop_first=False, dtype="uint8", ): - """ Returns a dataframe whose columns are the one hot encodings of all + """Returns a dataframe whose columns are the one hot encodings of all columns in `df` Parameters diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f425b650ee7..787b28e213c 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -7,7 +7,7 @@ import cudf from cudf.core.column.column import ColumnBase -from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.index import BaseIndex from cudf.core.series import Series from cudf.utils.dtypes import ( @@ -145,12 +145,12 @@ def _preprocess_host_value(self, value, dtype): else: return NA, dtype - if isinstance(dtype, Decimal64Dtype): + if isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): value = pa.scalar( value, type=pa.decimal128(dtype.precision, dtype.scale) ).as_py() if isinstance(value, decimal.Decimal) and dtype is None: - dtype = Decimal64Dtype._from_decimal(value) + dtype = cudf.Decimal64Dtype._from_decimal(value) value = to_cudf_compatible_scalar(value, dtype=dtype) @@ -171,7 +171,7 @@ def _preprocess_host_value(self, value, dtype): else: dtype = value.dtype - if not isinstance(dtype, Decimal64Dtype): + if not isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): dtype = cudf.dtype(dtype) if not valid: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c8d8837cbaa..2a9adf65283 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -85,7 +85,7 @@ def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): """Append a scalar `value` to the end of `col` inplace. 
- Cast to common type if possible + Cast to common type if possible """ to_type = find_common_type([type(value), col.dtype]) val_col = as_column(value, dtype=to_type) @@ -135,7 +135,11 @@ def __setitem__(self, key, value): if ( not isinstance( self._frame._column.dtype, - (cudf.Decimal64Dtype, cudf.CategoricalDtype), + ( + cudf.Decimal64Dtype, + cudf.Decimal32Dtype, + cudf.CategoricalDtype, + ), ) and hasattr(value, "dtype") and _is_non_decimal_numeric_dtype(value.dtype) @@ -2120,8 +2124,7 @@ def data(self): @property def index(self): - """The index object - """ + """The index object""" return self._index @index.setter @@ -2130,8 +2133,7 @@ def index(self, _index): @property def nullmask(self): - """The gpu buffer for the null-mask - """ + """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) def as_mask(self): diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 8c69b94cc84..5aa72215892 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -226,8 +226,7 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col): - """Handles empty and infinity strings - """ + """Handles empty and infinity strings""" col = libstrings.to_lower(col) col = _proc_empty_strings(col) col = _proc_inf_strings(col) @@ -235,8 +234,7 @@ def _proc_inf_empty_strings(col): def _proc_empty_strings(col): - """Replaces empty strings with NaN - """ + """Replaces empty strings with NaN""" s = cudf.Series(col) s = s.where(s != "", "NaN") return s._column diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index b568c108191..2341a5c23b9 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -18,7 +18,7 @@ def timeseries( nulls_frequency=0, seed=None, ): - """ Create timeseries dataframe with random data + """Create timeseries dataframe with random data Parameters ---------- @@ -81,7 +81,7 @@ def timeseries( def randomdata(nrows=10, dtypes=None, seed=None): - """ Create a dataframe with random data + """Create a dataframe with random data Parameters ---------- diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index b101835e626..cc5aec36853 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -68,7 +68,7 @@ def count_zero(arr): def assert_eq(left, right, **kwargs): - """ Assert that two cudf-like things are equivalent + """Assert that two cudf-like things are equivalent This equality test works for pandas/cudf dataframes/series/indexes/scalars in the same way, and so makes it easier to perform parametrized testing diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index afe21201b7e..f4a80c60ddf 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -368,6 +368,22 @@ def rand_dataframe( dtype=dtype, ) ) + elif dtype == "decimal32": + max_precision = meta.get( + "max_precision", cudf.Decimal32Dtype.MAX_PRECISION + ) + precision = np.random.randint(1, max_precision) + scale = np.random.randint(0, precision) + dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) + column_params.append( + ColumnParameters( + cardinality=cardinality, + null_frequency=null_frequency, + generator=decimal_generator(dtype=dtype, size=cardinality), + is_sorted=False, + dtype=dtype, + ) + ) elif dtype == "category": column_params.append( ColumnParameters( diff --git 
a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4d0dde88ec2..c6cf7c4e6f5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -92,8 +92,7 @@ def test_index_find_label_range_genericindex(): def test_index_find_label_range_rangeindex(): - """Cudf specific - """ + """Cudf specific""" # step > 0 # 3, 8, 13, 18 ridx = RangeIndex(3, 20, 5) @@ -344,8 +343,7 @@ def test_index_copy_string(name, dtype, deep=True): NUMERIC_TYPES + ["datetime64[ns]", "timedelta64[ns]"] + OTHER_TYPES, ) def test_index_copy_integer(name, dtype, deep=True): - """Test for NumericIndex Copy Casts - """ + """Test for NumericIndex Copy Casts""" cidx = cudf.Int64Index([1, 2, 3]) pidx = cidx.to_pandas() @@ -358,8 +356,7 @@ def test_index_copy_integer(name, dtype, deep=True): @pytest.mark.parametrize("name", ["x"]) @pytest.mark.parametrize("dtype", SIGNED_TYPES) def test_index_copy_float(name, dtype, deep=True): - """Test for NumericIndex Copy Casts - """ + """Test for NumericIndex Copy Casts""" cidx = cudf.Float64Index([1.0, 2.0, 3.0]) pidx = cidx.to_pandas() diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a6d0a10ce5d..2ded4925964 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -666,8 +666,7 @@ def test_multiindex_equals(): ) @pytest.mark.parametrize("names", [["X", "Y"]]) def test_multiindex_copy_sem(data, levels, codes, names): - """Test semantic equality for MultiIndex.copy - """ + """Test semantic equality for MultiIndex.copy""" gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -740,8 +739,8 @@ def test_multiindex_copy_sem(data, levels, codes, names): @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): """Test memory idendity for deep copy - Case1: Constructed from GroupBy, StringColumns - Case2: Constrcuted from MultiIndex, NumericColumns + Case1: Constructed from GroupBy, StringColumns + Case2: Constructed from MultiIndex, NumericColumns """ same_ref = not deep diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index f47e87374dc..6543af36dd4 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1103,6 +1103,8 @@ def test_dataframe_exceptions_for_clip(lower, upper): ([1, 2, 3, 4, 5], None, 4), ([1, 2, 3, 4, 5], None, None), ([1, 2, 3, 4, 5], 4, 2), + ([1.0, 2.0, 3.0, 4.0, 5.0], 4, 2), + (pd.Series([1, 2, 3, 4, 5], dtype="int32"), 4, 2), (["a", "b", "c", "d", "e"], "b", "d"), (["a", "b", "c", "d", "e"], "b", None), (["a", "b", "c", "d", "e"], None, "d"), @@ -1112,7 +1114,7 @@ def test_dataframe_exceptions_for_clip(lower, upper): @pytest.mark.parametrize("inplace", [True, False]) def test_series_clip(data, lower, upper, inplace): psr = pd.Series(data) - gsr = cudf.Series.from_pandas(data) + gsr = cudf.from_pandas(psr) expect = psr.clip(lower=lower, upper=upper) got = gsr.clip(lower=lower, upper=upper, inplace=inplace) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index b436825cf69..440dcf527ca 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -44,7 +44,7 @@ ) @pytest.mark.parametrize("to_host", [True, False]) def test_serialize(df, to_host): - """ This should hopefully replace all functions below """ + """This should hopefully replace all functions below""" a = df() if "cudf" not in type(a).__module__:
a = cudf.from_pandas(a) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 222b7c726fc..53676a47046 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -117,8 +117,7 @@ def test_series_sort_index(nelem, asc): @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) @pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7]) def test_series_nlargest(data, n): - """Indirectly tests Series.sort_values() - """ + """Indirectly tests Series.sort_values()""" sr = Series(data) psr = pd.Series(data) assert_eq(sr.nlargest(n), psr.nlargest(n)) @@ -136,8 +135,7 @@ def test_series_nlargest(data, n): @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) @pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9]) def test_series_nsmallest(data, n): - """Indirectly tests Series.sort_values() - """ + """Indirectly tests Series.sort_values()""" sr = Series(data) psr = pd.Series(data) assert_eq(sr.nsmallest(n), psr.nsmallest(n)) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index c8fb7c1a47d..fa5cde76524 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -332,8 +332,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): @functools.wraps(_make_row_wise_kernel) def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``. - """ + """Caching version of ``_make_row_wise_kernel``.""" if cache_key is None: cache_key = func try: @@ -349,8 +348,7 @@ def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): @functools.wraps(_make_chunk_wise_kernel) def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``. - """ + """Caching version of ``_make_chunk_wise_kernel``.""" try: return _cache[func] except KeyError: @@ -360,6 +358,5 @@ def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): def _mangle_user(name): - """Mangle user variable name - """ + """Mangle user variable name""" return "__user_{}".format(name) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c5620bed078..2eb38c0f77e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -121,8 +121,7 @@ def np_to_pa_dtype(dtype): - """Util to convert numpy dtype to PyArrow dtype. - """ + """Util to convert numpy dtype to PyArrow dtype.""" # special case when dtype is np.datetime64 if dtype.kind == "M": time_unit, _ = np.datetime_data(dtype) @@ -153,8 +152,7 @@ def get_numeric_type_info(dtype): def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic - """ + """Cast all args to a common type using numpy promotion logic""" dtype = np.result_type(*[a.dtype for a in args]) return [a.astype(dtype) for a in args] @@ -171,8 +169,8 @@ def _find_common_type_decimal(dtypes): def cudf_dtype_from_pydata_dtype(dtype): - """ Given a numpy or pandas dtype, converts it into the equivalent cuDF - Python dtype. + """Given a numpy or pandas dtype, converts it into the equivalent cuDF + Python dtype. """ if cudf.api.types.is_categorical_dtype(dtype): @@ -188,8 +186,8 @@ def cudf_dtype_from_pydata_dtype(dtype): def cudf_dtype_to_pa_type(dtype): - """ Given a cudf pandas dtype, converts it into the equivalent cuDF - Python dtype.
+ """Given a cudf pandas dtype, converts it into the equivalent cuDF + Python dtype. """ if cudf.api.types.is_categorical_dtype(dtype): raise NotImplementedError() @@ -204,8 +202,8 @@ def cudf_dtype_to_pa_type(dtype): def cudf_dtype_from_pa_type(typ): - """ Given a cuDF pyarrow dtype, converts it into the equivalent - cudf pandas dtype. + """Given a cuDF pyarrow dtype, converts it into the equivalent + cudf pandas dtype. """ if pa.types.is_list(typ): return cudf.core.dtypes.ListDtype.from_arrow(typ) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index 532fa925670..4f84fde5492 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -247,7 +247,7 @@ def hash_vocab( sep_token="[SEP]", ): """ - Write the vocab vocabulary hashtable to the output_path + Write the vocab vocabulary hashtable to the output_path """ np.random.seed(1243342) vocab = _load_vocab_dict(vocab_path) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 3c98d27868d..6746753249c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1115,7 +1115,8 @@ def _is_local_filesystem(fs): def ensure_single_filepath_or_buffer(path_or_data, **kwargs): - """Return False if `path_or_data` resolves to multiple filepaths or buffers + """Return False if `path_or_data` resolves to multiple filepaths or + buffers. """ path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): @@ -1140,8 +1141,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): def is_directory(path_or_data, **kwargs): - """Returns True if the provided filepath is a directory - """ + """Returns True if the provided filepath is a directory""" path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options") diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index c71a6dbccb1..c1c5513c718 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -157,8 +157,7 @@ def {kernelname}(out, {args}): def _wrap_query_expr(name, fn, args): - """Wrap the query expression in a cuda kernel. 
- """ + """Wrap the query expression in a cuda kernel.""" def _add_idx(arg): if arg.startswith(ENVREF_PREFIX): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4995a5f31b5..4f9b23bf6fe 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -383,8 +383,8 @@ def _cast_to_appropriate_cudf_type(val, index=None): def _get_cupy_compatible_args_index(args, ser_index=None): """ - This function returns cupy compatible arguments and output index - if conversion is not possible it returns None + This function returns cupy compatible arguments and output index + if conversion is not possible it returns None """ casted_ls = [] diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 2907753d8d5..bf063918c89 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -33,7 +33,7 @@ class _Frame(dd.core._Frame, OperatorMethodMixin): - """ Superclass for DataFrame and Series + """Superclass for DataFrame and Series Parameters ---------- @@ -263,13 +263,13 @@ def sort_values( return df def to_parquet(self, path, *args, **kwargs): - """ Calls dask.dataframe.io.to_parquet with CudfEngine backend """ + """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) def to_orc(self, path, **kwargs): - """ Calls dask_cudf.io.to_orc """ + """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc return to_orc(self, path, **kwargs) @@ -326,8 +326,7 @@ def repartition(self, *args, **kwargs): return super().repartition(*args, **kwargs) def shuffle(self, *args, **kwargs): - """ Wraps dask.dataframe DataFrame.shuffle method - """ + """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) if shuffle_arg and shuffle_arg != "tasks": raise ValueError("dask_cudf does not support disk-based shuffle.") diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 0cf9d835523..3eb3975c41f 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -180,24 +180,24 @@ def groupby_agg( sort=False, as_index=True, ): - """ Optimized groupby aggregation for Dask-CuDF. - - This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + """Optimized groupby aggregation for Dask-CuDF. + + This aggregation algorithm only supports the following options: + + - "count" + - "mean" + - "std" + - "var" + - "sum" + - "min" + - "max" + - "collect" + - "first" + - "last" + + This "optimized" approach is more performant than the algorithm + in `dask.dataframe`, because it allows the cudf backend to + perform multiple aggregations at once. 
""" # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) @@ -348,8 +348,7 @@ def groupby_agg( def _redirect_aggs(arg): - """ Redirect aggregations to their corresponding name in cuDF - """ + """Redirect aggregations to their corresponding name in cuDF""" redirects = { sum: "sum", max: "max", @@ -375,8 +374,7 @@ def _redirect_aggs(arg): def _is_supported(arg, supported: set): - """ Check that aggregations in `arg` are a subset of `supported` - """ + """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): _global_set: Set[str] = set() @@ -395,8 +393,7 @@ def _is_supported(arg, supported: set): def _make_name(*args, sep="_"): - """ Combine elements of `args` into a new string - """ + """Combine elements of `args` into a new string""" _args = (arg for arg in args if arg != "") return sep.join(_args) @@ -404,15 +401,15 @@ def _make_name(*args, sep="_"): def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): - """ Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. """ # Modify dict for initial (partition-wise) aggregations @@ -459,15 +456,15 @@ def _groupby_partition_agg( def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): - """ Node in groupby-aggregation reduction tree. - - Following the initial `_groupby_partition_agg` tasks, - the `groupby_agg` algorithm will perform a tree reduction - to combine the data from the input partitions into - `split_out` different output partitions. For each node in - the reduction tree, the input DataFrame objects are - concatenated, and "sum", "min" and/or "max" groupby - aggregations are used to combine the necessary statistics. + """Node in groupby-aggregation reduction tree. + + Following the initial `_groupby_partition_agg` tasks, + the `groupby_agg` algorithm will perform a tree reduction + to combine the data from the input partitions into + `split_out` different output partitions. For each node in + the reduction tree, the input DataFrame objects are + concatenated, and "sum", "min" and/or "max" groupby + aggregations are used to combine the necessary statistics. """ df = _concat(dfs, ignore_index=True) @@ -493,8 +490,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """ Calculate variance (given count, sum, and sum-squared columns). 
- """ + """Calculate variance (given count, sum, and sum-squared columns).""" # Select count, sum, and sum-squared n = df[count_name] @@ -526,13 +522,13 @@ def _finalize_gb_agg( str_cols_out, aggs_renames, ): - """ Final aggregation task. + """Final aggregation task. - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. """ # Deal with higher-order aggregations diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 9b7a58a3c59..2e5d55e92d2 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -351,7 +351,7 @@ def read_parquet( row_groups_per_part=None, **kwargs, ): - """ Read parquet files into a Dask DataFrame + """Read parquet files into a Dask DataFrame Calls ``dask.dataframe.read_parquet`` to cordinate the execution of ``cudf.read_parquet``, and ultimately read multiple partitions into diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 21bf3aee7d1..5f2af445170 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -46,7 +46,7 @@ def _quantile(a, q): def merge_quantiles(finalq, qs, vals): - """ Combine several quantile calculations of different data. + """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] """ if isinstance(finalq, Iterator): @@ -223,8 +223,7 @@ def sort_values( ascending=True, na_position="last", ): - """ Sort by the given list/tuple of column names. - """ + """Sort by the given list/tuple of column names.""" if na_position not in ("first", "last"): raise ValueError("na_position must be either 'first' or 'last'")