From de55832127fba6fd79cf4f4ce3fb647ea1c4ab8b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 03:36:06 -0600 Subject: [PATCH 01/14] Add is_integer API that can check for the validity of a string-to-integer conversion (#7642) This PR addresses #5110, #7080, and rework https://github.com/rapidsai/cudf/pull/7094. It adds the function `cudf::strings::is_integer` that can check if strings can be correctly converted into integer values. Underflow and overflow are also taken into account. Note that this `cudf::strings::is_integer` is different from the existing `cudf::strings::string::is_integer`, which only checks for pattern and does not care about under/overflow. Examples: ``` s = { "eee", "-200", "-100", "127", "128", "1.5", NULL} is_integer(s, INT8) = { 0, 0, 1, 1, 0, 0, NULL} is_integer(s, INT32) = { 0, 1, 1, 1, 1, 0, NULL} ``` Authors: - Nghia Truong (@ttnghia) Approvers: - David (@davidwendt) - Jake Hemstad (@jrhemstad) - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7642 --- .../cudf/strings/convert/convert_integers.hpp | 43 +++- cpp/src/strings/convert/convert_integers.cu | 212 ++++++++++++++---- cpp/tests/strings/integers_tests.cu | 197 ++++++++++++++-- 3 files changed, 383 insertions(+), 69 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 1e2fa80b129..4d29b0a5b6a 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -78,7 +78,10 @@ std::unique_ptr from_integers( * characters are valid for conversion to integers. * * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. + * have all characters in [-+0-9]. The optional sign character must only be in the first + * position. Notice that the the integer value is not checked to be within its storage limits. + * For strict integer type check, use the other `is_integer()` API which accepts `data_type` + * argument. * * @code{.pseudo} * Example: @@ -89,12 +92,44 @@ std::unique_ptr from_integers( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has all characters in [-+0-9]. The optional sign character must only be in the first + * position. Also, the integer component must fit within the size limits of the underlying + * storage type, which is provided by the int_type parameter. 
+ * + * @code{.pseudo} + * Example: + * s = ['123456', '-456', '', 'A', '+7'] + * + * output1 = s.is_integer(s, data_type{type_id::INT32}) + * output1 is [true, true, false, false, true] + * + * output2 = s.is_integer(s, data_type{type_id::INT8}) + * output2 is [false, false, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param int_type Integer type used for checking underflow and overflow. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. */ std::unique_ptr is_integer( strings_column_view const& strings, + data_type int_type, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 5c5032b5c87..7eee2b3cc0e 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -38,6 +37,160 @@ namespace cudf { namespace strings { + +namespace detail { +namespace { + +/** + * @brief This only checks if a string is a valid integer within the bounds of its storage type. + */ +template +struct string_to_integer_check_fn { + __device__ bool operator()(thrust::pair const& p) const + { + if (!p.second || p.first.empty()) { return false; } + + auto const d_str = p.first.data(); + if (d_str[0] == '-' && std::is_unsigned::value) { return false; } + + auto iter = d_str + static_cast((d_str[0] == '-' || d_str[0] == '+')); + auto const iter_end = d_str + p.first.size_bytes(); + if (iter == iter_end) { return false; } + + auto const sign = d_str[0] == '-' ? IntegerType{-1} : IntegerType{1}; + auto const bound_val = + sign > 0 ? std::numeric_limits::max() : std::numeric_limits::min(); + + IntegerType value = 0; // parse the string to integer and check for overflow along the way + while (iter != iter_end) { // check all bytes for valid characters + auto const chr = *iter++; + // Check for valid character + if (chr < '0' || chr > '9') { return false; } + + // Check for underflow and overflow: + auto const digit = static_cast(chr - '0'); + auto const bound_check = (bound_val - sign * digit) / IntegerType{10} * sign; + if (value > bound_check) return false; + value = value * IntegerType{10} + digit; + } + + return true; + } +}; + +/** + * @brief The dispatch functions for checking if strings are valid integers. 
+ */ +struct dispatch_is_integer_fn { + template ::value>* = nullptr> + std::unique_ptr operator()(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto const d_column = column_device_view::create(strings.parent(), stream); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + + auto d_results = results->mutable_view().data(); + if (strings.has_nulls()) { + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + string_to_integer_check_fn{}); + } else { + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + string_to_integer_check_fn{}); + } + + // Calling mutable_view() on a column invalidates it's null count so we need to set it back + results->set_null_count(strings.null_count()); + + return results; + } + + template ::value>* = nullptr> + std::unique_ptr operator()(strings_column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("is_integer is expecting an integer type"); + } +}; + +} // namespace + +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const d_column = column_device_view::create(strings.parent(), stream); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + + auto d_results = results->mutable_view().data(); + if (strings.has_nulls()) { + thrust::transform( + rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + } else { + thrust::transform( + rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return p.second ? 
string::is_integer(p.first) : false; }); + } + + // Calling mutable_view() on a column invalidates it's null count so we need to set it back + results->set_null_count(strings.null_count()); + + return results; +} + +std::unique_ptr is_integer( + strings_column_view const& strings, + data_type int_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + if (strings.is_empty()) { return cudf::make_empty_column(data_type{type_id::BOOL8}); } + return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); +} + +} // namespace detail + +// external APIs +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + +std::unique_ptr is_integer(strings_column_view const& strings, + data_type int_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, int_type, rmm::cuda_stream_default, mr); +} + namespace detail { namespace { /** @@ -69,11 +222,10 @@ struct dispatch_to_integers_fn { mutable_column_view& output_column, rmm::cuda_stream_view stream) const { - auto d_results = output_column.data(); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), - d_results, + output_column.data(), string_to_integer_fn{strings_column}); } // non-integral types throw an exception @@ -102,19 +254,22 @@ std::unique_ptr to_integers(strings_column_view const& strings, { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, + + // Create integer output column copying the strings null-mask + auto results = make_numeric_column(output_type, strings_count, cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), stream, mr); - auto results_view = results->mutable_view(); - // fill output column with integers - type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); + // Fill output column with integers + auto const strings_dev_view = column_device_view::create(strings.parent(), stream); + auto results_view = results->mutable_view(); + type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream); + + // Calling mutable_view() on a column invalidates it's null count so we need to set it back results->set_null_count(strings.null_count()); + return results; } @@ -253,42 +408,5 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } -namespace detail { -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - 
thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} -} // namespace detail - -// external API -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index d6bf03b3f76..f15116ae4c2 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -26,20 +26,18 @@ #include #include +// Using an alias variable for the null elements +// This will make the code looks cleaner +constexpr auto NULL_VAL = 0; + struct StringsConvertTest : public cudf::test::BaseFixture { }; -TEST_F(StringsConvertTest, IsInteger) +TEST_F(StringsConvertTest, IsIntegerBasicCheck) { - cudf::test::strings_column_wrapper strings; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - cudf::test::strings_column_wrapper strings1( {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); @@ -50,24 +48,187 @@ TEST_F(StringsConvertTest, IsInteger) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } +TEST_F(StringsConvertTest, ZeroSizeIsIntegerBasicCheck) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckNoNull) +{ + auto strings = cudf::test::strings_column_wrapper( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto expected = + cudf::test::fixed_width_column_wrapper({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckWithNulls) +{ + std::vector const h_strings{ + "eee", "1234", nullptr, "", "-9832", "93.24", "765é", nullptr}; + auto const strings = cudf::test::strings_column_wrapper( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const results = 
cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + // Input has null elements then the output should have the same null mask + auto const expected = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, 1, NULL_VAL, 0, 1, 0, 0, NULL_VAL}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, ZeroSizeIsIntegerBoundCheck) +{ + // Empty input + auto strings = cudf::test::strings_column_wrapper{}; + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckSmallNumbers) +{ + auto strings = cudf::test::strings_column_wrapper( + {"-200", "-129", "-128", "-120", "0", "120", "127", "130", "150", "255", "300", "500"}); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT8}); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT8}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"-40000", "-32769", "-32768", "-32767", "-32766", "32765", "32766", "32767", "32768"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT16}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT16}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckLargeNumbers) +{ + auto strings = + cudf::test::strings_column_wrapper({"-2147483649", // std::numeric_limits::min() - 1 + "-2147483648", // std::numeric_limits::min() + "-2147483647", // std::numeric_limits::min() + 1 + "2147483646", // std::numeric_limits::max() - 1 + "2147483647", // std::numeric_limits::max() + "2147483648", // std::numeric_limits::max() + 1 + "4294967294", // std::numeric_limits::max() - 1 + "4294967295", // std::numeric_limits::max() + "4294967296"}); // std::numeric_limits::max() + 1 + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 1, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT32}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + 
strings = cudf::test::strings_column_wrapper( + {"-9223372036854775809", // std::numeric_limits::min() - 1 + "-9223372036854775808", // std::numeric_limits::min() + "-9223372036854775807", // std::numeric_limits::min() + 1 + "9223372036854775806", // std::numeric_limits::max() - 1 + "9223372036854775807", // std::numeric_limits::max() + "9223372036854775808", // std::numeric_limits::max() + 1 + "18446744073709551614", // std::numeric_limits::max() - 1 + "18446744073709551615", // std::numeric_limits::max() + "18446744073709551616"}); // std::numeric_limits::max() + 1 + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT64}); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 1, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT64}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsConvertTest, ToInteger) { - std::vector h_strings{ - "eee", "1234", nullptr, "", "-9832", "93.24", "765é", "-1.78e+5", "2147483647", "-2147483648"}; + std::vector h_strings{"eee", + "1234", + nullptr, + "", + "-9832", + "93.24", + "765é", + nullptr, + "-1.78e+5", + "2147483647", + "-2147483648", + "2147483648"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - std::vector h_expected{0, 1234, 0, 0, -9832, 93, 765, -1, 2147483647, -2147483648}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_integers(strings_view, cudf::data_type{cudf::type_id::INT32}); + auto results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT16}); + auto const expected_i16 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, -1, 0, 0}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i16); - cudf::test::fixed_width_column_wrapper expected( - h_expected.begin(), - h_expected.end(), + results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto const expected_i32 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{ + 0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, 2147483647, -2147483648, -2147483648}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i32); + + results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT32}); + auto const expected_u32 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, + 1234, + NULL_VAL, + 0, + 4294957464, + 93, + 765, + NULL_VAL, + 4294967295, + 2147483647, + 2147483648, + 2147483648}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_u32); } TEST_F(StringsConvertTest, FromInteger) @@ -114,7 +275,7 @@ TEST_F(StringsConvertTest, EmptyStringsColumn) cudf::test::strings_column_wrapper strings({"", "", ""}); auto results = 
cudf::strings::to_integers(cudf::strings_column_view(strings), cudf::data_type{cudf::type_id::INT64}); - cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); + cudf::test::fixed_width_column_wrapper expected{0, 0, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } From 6ed360c57dbd441c1b54b0b4d3a2b4dcaf841c27 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Wed, 24 Mar 2021 06:50:12 -0400 Subject: [PATCH 02/14] Add gbenchmark for nvtext tokenize functions (#7684) Reference #5696 Creates gbenchmarks for `nvtext::tokenize()`, `nvtext::count_tokens()` and `nvtext::ngrams_tokenize()` functions. The benchmarks measures various string lengths and number of rows. These functions use the `make_strings_column` factory optimized in #7576 Authors: - David (@davidwendt) Approvers: - Conor Hoekstra (@codereport) - Nghia Truong (@ttnghia) - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7684 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/text/tokenize_benchmark.cpp | 92 ++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 cpp/benchmarks/text/tokenize_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d59e582b1fb..7fd84b508ac 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,6 +177,7 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu) ConfigureBench(TEXT_BENCH text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp + text/tokenize_benchmark.cpp text/subword_benchmark.cpp) ################################################################################################### diff --git a/cpp/benchmarks/text/tokenize_benchmark.cpp b/cpp/benchmarks/text/tokenize_benchmark.cpp new file mode 100644 index 00000000000..f9e742f0f31 --- /dev/null +++ b/cpp/benchmarks/text/tokenize_benchmark.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +class TextTokenize : public cudf::benchmark { +}; + +enum class tokenize_type { single, multi, count, count_multi, ngrams }; + +static void BM_tokenize(benchmark::State& state, tokenize_type tt) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (tt) { + case tokenize_type::single: nvtext::tokenize(input); break; + case tokenize_type::multi: + nvtext::tokenize(input, cudf::strings_column_view(delimiters)); + break; + case tokenize_type::count: nvtext::count_tokens(input); break; + case tokenize_type::count_multi: + nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); + break; + case tokenize_type::ngrams: + // default is bigrams + nvtext::ngrams_tokenize(input); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextTokenize, name) \ + (::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \ + BENCHMARK_REGISTER_F(TextTokenize, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(single) +NVTEXT_BENCHMARK_DEFINE(multi) +NVTEXT_BENCHMARK_DEFINE(count) +NVTEXT_BENCHMARK_DEFINE(count_multi) +NVTEXT_BENCHMARK_DEFINE(ngrams) From 0c36ca9187f2a72820bff92f2ddfc7cffc4f1eb0 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 24 Mar 2021 06:52:58 -0400 Subject: [PATCH 03/14] Refactor ConfigureCUDA to not conditionally insert compiler flags (#7643) Changing the state of `BUILD_TESTING` or `BUILD_BENCHMARKS` now doesn't cause a recompilation of all cudf source files. 
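The hunk below only removes the conditional append to the shared `CUDF_CXX_FLAGS`/`CUDF_CUDA_FLAGS` lists; this excerpt does not show where (or whether) the gmock `-Wno-parentheses` suppression is reapplied. As a rough sketch of the idea behind the refactor (assuming hypothetical names such as `configure_cudf_test`, plus the usual `cudf::cudf` and `GTest::gmock` targets), a test-only warning suppression can be attached to the test executables themselves, so the compile flags seen by every libcudf source file no longer depend on `BUILD_TESTING` or `BUILD_BENCHMARKS`:

```cmake
# Minimal sketch (not part of this patch): scope the gmock parentheses
# suppression to each test target instead of the global CUDF_*_FLAGS lists.
function(configure_cudf_test target)
  add_executable(${target} ${ARGN})
  # Only test targets see the extra flag, so toggling BUILD_TESTING or
  # BUILD_BENCHMARKS leaves the library's compile commands untouched.
  target_compile_options(${target} PRIVATE
    $<$<COMPILE_LANGUAGE:CXX>:-Wno-parentheses>
    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-parentheses>)
  target_link_libraries(${target} PRIVATE cudf::cudf GTest::gmock GTest::gtest_main)
endfunction()
```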
Authors: - Robert Maynard (@robertmaynard) Approvers: - Keith Kraus (@kkraus14) - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7643 --- cpp/cmake/Modules/ConfigureCUDA.cmake | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index 3a24d0d634b..b0d048c6294 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -29,10 +29,6 @@ enable_language(CUDA) if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) - if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS) - # Suppress parentheses warning which causes gmock to fail - list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-parentheses) - endif() endif() list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) From 444b889a05a8697133f01bcbd7ada20424127bdd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 24 Mar 2021 06:09:30 -0500 Subject: [PATCH 04/14] Add Resources to README. (#7697) Resolves #7217 by adding a section of commonly needed resource links at the top of the README. In #7217, I also proposed adding relevant badges (e.g. for build status, download links, citation information, etc.). I would be happy to add that to this PR if that is of interest. I'm opening the PR without badges for now, because I think the "Resources" section is valuable by itself, for readers who want quick access to common destinations. Authors: - Bradley Dice (@bdice) Approvers: - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7697 --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index c0fa500ad77..687d25c200b 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,17 @@ **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch. +## Resources + +- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides. +- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference. +- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF. +- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. +- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code. +- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features. + +## Overview + Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. From 267d29ba5a438af684c921ff66558d0822f5db59 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 24 Mar 2021 10:37:11 -0400 Subject: [PATCH 05/14] Update codeowners file (#7701) This PR updates the `codeowners` file to only require `ops-codeowners` reviews on the `Dockerfile`/`.dockerignore`/`docker` files in the repo's root directory. This will prevent `ops-codeowners` from getting tagged in reviews for PRs such as #7671. 
Authors: - AJ Schmidt (@ajschmidt8) Approvers: - Jordan Jacobelli (@Ethyling) - Mike Wendt (@mike-wendt) URL: https://github.com/rapidsai/cudf/pull/7701 --- .github/CODEOWNERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5f0be6d797a..59e2ea224f6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -14,9 +14,9 @@ python/dask_cudf/ @rapidsai/cudf-dask-codeowners java/ @rapidsai/cudf-java-codeowners #build/ops code owners -.github/ @rapidsai/ops-codeowners -/ci/ @rapidsai/ops-codeowners +.github/ @rapidsai/ops-codeowners +/ci/ @rapidsai/ops-codeowners conda/ @rapidsai/ops-codeowners -**/Dockerfile @rapidsai/ops-codeowners -**/.dockerignore @rapidsai/ops-codeowners -docker/ @rapidsai/ops-codeowners +/Dockerfile @rapidsai/ops-codeowners +/.dockerignore @rapidsai/ops-codeowners +/docker/ @rapidsai/ops-codeowners From 8a7af11f7da1fd43fde4d308cc6394371f884e05 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 24 Mar 2021 13:01:23 -0400 Subject: [PATCH 06/14] Fixing empty null lists throwing explode_outer for a loop. (#7649) I found two issues, one was that we didn't build the correct number of null or empty offsets. We should build them for the exploded column and they are sized as such, but I was marching over it the size of the child data column. This didn't cause trouble as long as there was more data than nulls. The second issue was the large loop. We have to go over that loop at least the number of nulls we have as that loop is doing two things at once. 1) writing the valid data rows to a gather map. 2) filling in the holes in the gather map for null and empty entries. This was another case of things working fine as long as we ran the loop enough to cover all the null entries, which happens unless there are more nulls than entries. That wasn't tested and so it was never seen. Thankfully, @sperlingxx tested exactly that. Added a test for this case. Fixes #7636 Authors: - Mike Wilson (@hyperbolic2346) Approvers: - Nghia Truong (@ttnghia) - MithunR (@mythrocks) URL: https://github.com/rapidsai/cudf/pull/7649 --- cpp/src/lists/explode.cu | 77 +++--- cpp/tests/lists/explode_tests.cpp | 374 +++++++++++++++++++++++------- 2 files changed, 326 insertions(+), 125 deletions(-) diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 8233635050e..2b495deb47f 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -188,7 +188,7 @@ std::unique_ptr explode_outer(table_view const& input_table, }); thrust::inclusive_scan(rmm::exec_policy(stream), null_or_empty, - null_or_empty + sliced_child.size(), + null_or_empty + explode_col.size(), null_or_empty_offset.begin()); auto null_or_empty_count = @@ -209,41 +209,48 @@ std::unique_ptr
explode_outer(table_view const& input_table, // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. auto offsets_minus_one = thrust::make_transform_iterator( thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + + auto fill_gather_maps = [offsets_minus_one, + gather_map_p = gather_map.begin(), + explode_col_gather_map_p = explode_col_gather_map.begin(), + position_array = pos.begin(), + sliced_child_size = sliced_child.size(), + null_or_empty_offset_p = null_or_empty_offset.begin(), + include_position, + offsets, + null_or_empty, + offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { + if (idx < sliced_child_size) { + auto lb_idx = + thrust::distance(offsets_minus_one, + thrust::lower_bound( + thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); + auto index_to_write = null_or_empty_offset_p[lb_idx] + idx; + gather_map_p[index_to_write] = lb_idx; + explode_col_gather_map_p[index_to_write] = idx; + if (include_position) { + position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); + } + } + if (null_or_empty[idx]) { + auto invalid_index = null_or_empty_offset_p[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset_p[idx] - 1; + gather_map_p[invalid_index] = idx; + + // negative one to indicate a null value + explode_col_gather_map_p[invalid_index] = -1; + if (include_position) { position_array[invalid_index] = 0; } + } + }; + + // we need to do this loop at least explode_col times or we may not properly fill in null and + // empty entries. + auto loop_count = std::max(sliced_child.size(), explode_col.size()); + // Fill in gather map with all the child column's entries - thrust::for_each(rmm::exec_policy(stream), - counting_iter, - counting_iter + sliced_child.size(), - [offsets_minus_one, - gather_map = gather_map.begin(), - explode_col_gather_map = explode_col_gather_map.begin(), - position_array = pos.begin(), - include_position, - offsets, - null_or_empty_offset = null_or_empty_offset.begin(), - null_or_empty, - offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { - auto lb_idx = thrust::distance( - offsets_minus_one, - thrust::lower_bound( - thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); - auto index_to_write = null_or_empty_offset[lb_idx] + idx; - gather_map[index_to_write] = lb_idx; - explode_col_gather_map[index_to_write] = idx; - if (include_position) { - position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); - } - if (null_or_empty[idx]) { - auto invalid_index = null_or_empty_offset[idx] == 0 - ? offsets[idx] - : offsets[idx] + null_or_empty_offset[idx] - 1; - gather_map[invalid_index] = idx; - - // negative one to indicate a null value - explode_col_gather_map[invalid_index] = -1; - - if (include_position) { position_array[invalid_index] = 0; } - } - }); + thrust::for_each( + rmm::exec_policy(stream), counting_iter, counting_iter + loop_count, fill_gather_maps); return build_table( input_table, diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index 2ec9294d118..4c7ded0efd7 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -102,15 +102,17 @@ TEST_F(ExplodeTest, Basics) TEST_F(ExplodeTest, SingleNull) { // a b - // [1, 2, 7] 100 + // null 100 // [5, 6] 200 // [] 300 // [0, 3] 400 + constexpr auto null = 0; + auto first_invalid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? 
false : true; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); FCW b({100, 200, 300, 400}); FCW expected_a{5, 6, 0, 3}; @@ -134,15 +136,17 @@ TEST_F(ExplodeTest, Nulls) { // a b // [1, 2, 7] 100 - // [5, 6] 200 + // null 200 // [0, 3] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); auto always_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids); FCW b({100, 200, 300}, valids); FCW expected_a({1, 2, 7, 0, 3}); @@ -165,18 +169,21 @@ TEST_F(ExplodeTest, Nulls) TEST_F(ExplodeTest, NullsInList) { // a b - // [1, 2, 7] 100 - // [5, 6, 0, 9] 200 + // [1, null, 7] 100 + // [5, null, 0, null] 200 // [] 300 - // [0, 3, 8] 400 + // [0, null, 8] 400 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + LCW a{ + LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)}; FCW b{100, 200, 300, 400}; - FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + FCW expected_a({1, null, 7, 5, null, 0, null, 0, null, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; cudf::table_view t({a, b}); @@ -224,16 +231,18 @@ TEST_F(ExplodeTest, NestedNulls) { // a b // [[1, 2], [7, 6, 5]] 100 - // [[5, 6]] 200 + // null null // [[0, 3],[5],[2, 1]] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); auto always_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); - FCW b({100, 200, 300}, valids); + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{null}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, null, 300}, valids); LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{0, 3}, LCW{5}, LCW{2, 1}}; FCW expected_b({100, 100, 300, 300, 300}, always_valid); @@ -254,21 +263,23 @@ TEST_F(ExplodeTest, NestedNulls) TEST_F(ExplodeTest, NullsInNested) { - // a b - // [[1, 2], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b({100, 200, 300}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b{100, 100, 200, 300, 300, 300}; cudf::table_view t({a, b}); @@ -287,20 +298,22 @@ TEST_F(ExplodeTest, NullsInNested) TEST_F(ExplodeTest, NullsInNestedDoubleExplode) { - // a b - // [[1, 2], [], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW a{LCW{LCW({1, null}, valids), LCW{}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}; FCW b{100, 200, 300}; - FCW expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_a({1, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; cudf::table_view t({a, b}); @@ -320,23 +333,25 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode) TEST_F(ExplodeTest, NestedStructs) { - // a b - // [[1, 2], [7, 6, 5]] {100, "100"} - // [[5, 6]] {200, "200"} - // [[0, 3],[5],[2, 1]] {300, "300"} + // a b + // [[1, null], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, null]] {300, "300"} + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b1({100, 200, 300}); strings_column_wrapper b2{"100", "200", "300"}; structs_column_wrapper b({b1, b2}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b1{100, 100, 200, 300, 300, 300}; strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; structs_column_wrapper expected_b({expected_b1, expected_b2}); @@ -397,15 +412,17 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) TEST_F(ExplodeTest, SlicedList) { - // a b - // [[1, 2],[7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 - // [[8, 3],[],[4, 3, 1, 2]] 400 - // [[2, 3, 4],[9, 8]] 500 + // a b + // [[1, null],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + // [[8, 3],[],[4, null, 1, null]] 400 + // [[2, 3, 4],[9, 8]] 500 // slicing the top 2 rows and the bottom row off + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); @@ -417,7 +434,7 @@ TEST_F(ExplodeTest, SlicedList) FCW b({100, 200, 300, 400, 500}); LCW expected_a{ - LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + LCW{0, 3}, LCW{5}, LCW({2, null}, valids), LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)}; FCW expected_b{300, 300, 300, 400, 400, 400}; cudf::table_view t({a, b}); @@ -490,19 +507,21 @@ TEST_F(ExplodeOuterTest, Basics) TEST_F(ExplodeOuterTest, SingleNull) { - // a b - // [1, 2, 7] 100 - // [5, 6] 200 - // [] 300 - // [0, 3] 400 + // a b + // null 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + constexpr auto null = 0; auto first_invalid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); FCW b({100, 200, 300, 400}); - FCW expected_a{{0, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; + FCW expected_a{{null, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; FCW expected_b{100, 200, 200, 300, 400, 400}; cudf::table_view t({a, b}); @@ -522,17 +541,19 @@ TEST_F(ExplodeOuterTest, Nulls) { // a b // [1, 2, 7] 100 - // [5, 6] 200 + // null null // [0, 3] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); - FCW b({100, 200, 300}, valids); + LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids); + FCW b({100, null, 300}, valids); - FCW expected_a({1, 2, 7, 0, 0, 3}, {1, 1, 1, 0, 1, 1}); - FCW expected_b({100, 100, 100, 200, 300, 300}, {1, 1, 1, 0, 1, 1}); + FCW expected_a({1, 2, 7, null, 0, 3}, {1, 1, 1, 0, 1, 1}); + FCW expected_b({100, 100, 100, null, 300, 300}, {1, 1, 1, 0, 1, 1}); cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -547,21 +568,182 @@ TEST_F(ExplodeOuterTest, Nulls) CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } +TEST_F(ExplodeOuterTest, AllNulls) +{ + // a b + // null 100 + // null 200 + // null 300 + + constexpr auto null = 0; + + auto non_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; }); + + LCW a({LCW{null}, LCW{null}, LCW{null}}, non_valid); + FCW b({100, 200, 300}); + + FCW expected_a({null, null, null}, {0, 0, 0}); + FCW expected_b({100, 200, 300}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SequentialNulls) +{ + // a b + // [1, 2, null] 100 + // [3, 4] 200 + // [] 300 + // [] 400 + // [5, 6, 7] 500 + + constexpr auto null = 0; + + auto third_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? 
false : true; }); + + LCW a{LCW({1, 2, null}, third_invalid), LCW{3, 4}, LCW{}, LCW{}, LCW{5, 6, 7}}; + FCW b{100, 200, 300, 400, 500}; + + FCW expected_a({1, 2, null, 3, 4, null, null, 5, 6, 7}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1}); + FCW expected_b({100, 100, 100, 200, 200, 300, 400, 500, 500, 500}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, MoreEmptyThanData) +{ + // a b + // [1, 2] 100 + // [] 200 + // [] 300 + // [] 400 + // [] 500 + // [3] 600 + + constexpr auto null = 0; + + LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}, LCW{3}}; + FCW b{100, 200, 300, 400, 500, 600}; + + FCW expected_a({1, 2, null, null, null, null, 3}, {1, 1, 0, 0, 0, 0, 1}); + FCW expected_b({100, 100, 200, 300, 400, 500, 600}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, TrailingEmptys) +{ + // a b + // [1, 2] 100 + // [] 200 + // [] 300 + // [] 400 + // [] 500 + + constexpr auto null = 0; + + LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}}; + FCW b{100, 200, 300, 400, 500}; + + FCW expected_a({1, 2, null, null, null, null}, {1, 1, 0, 0, 0, 0}); + FCW expected_b({100, 100, 200, 300, 400, 500}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, LeadingNulls) +{ + // a b + // null 100 + // null 200 + // null 300 + // null 400 + // [1, 2] 500 + + constexpr auto null = 0; + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 4 ? 
true : false; }); + + LCW a({LCW{null}, LCW{null}, LCW{null}, LCW{null}, LCW{1, 2}}, valids); + FCW b{100, 200, 300, 400, 500}; + + FCW expected_a({null, null, null, null, 1, 2}, {0, 0, 0, 0, 1, 1}); + FCW expected_b({100, 200, 300, 400, 500, 500}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 0, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + TEST_F(ExplodeOuterTest, NullsInList) { // a b - // [1, 2, 7] 100 - // [5, 6, 0, 9] 200 + // [1, null, 7] 100 + // [5, null, 0, null] 200 // [] 300 - // [0, 3, 8] 400 + // [0, null, 8] 400 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + LCW a{ + LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)}; FCW b{100, 200, 300, 400}; - FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); + FCW expected_a({1, null, 7, 5, null, 0, null, null, 0, null, 8}, + {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400}; cudf::table_view t({a, b}); @@ -612,15 +794,18 @@ TEST_F(ExplodeOuterTest, NestedNulls) // [[5, 6]] 200 // [[0, 3],[5],[2, 1]] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{null}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); FCW b({100, 200, 300}); auto expected_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? false : true; }); - LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, expected_valids); + LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{null}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, + expected_valids); FCW expected_b({100, 100, 200, 300, 300, 300}); cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -637,21 +822,23 @@ TEST_F(ExplodeOuterTest, NestedNulls) TEST_F(ExplodeOuterTest, NullsInNested) { - // a b - // [[1, 2], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b({100, 200, 300}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b{100, 100, 200, 300, 300, 300}; cudf::table_view t({a, b}); @@ -670,20 +857,23 @@ TEST_F(ExplodeOuterTest, NullsInNested) TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) { - // a b - // [[1, 2], [], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW a{LCW{LCW({1, null}, valids), LCW{}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}; FCW b{100, 200, 300}; - FCW expected_a({1, 2, 0, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_a({1, null, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null}, + {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; cudf::table_view t({a, b}); @@ -703,23 +893,25 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) TEST_F(ExplodeOuterTest, NestedStructs) { - // a b - // [[1, 2], [7, 6, 5]] {100, "100"} - // [[5, 6]] {200, "200"} - // [[0, 3],[5],[2, 1]] {300, "300"} + // a b + // [[1, null], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, null]] {300, "300"} + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b1({100, 200, 300}); strings_column_wrapper b2{"100", "200", "300"}; structs_column_wrapper b({b1, b2}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b1{100, 100, 200, 300, 300, 300}; strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; structs_column_wrapper expected_b({expected_b1, expected_b2}); @@ -780,27 +972,29 @@ TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) TEST_F(ExplodeOuterTest, SlicedList) { - // a b - // [[1, 2],[7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 - // [[8, 3],[],[4, 3, 1, 2]] 400 - // [[2, 3, 4],[9, 8]] 500 + // a b + // [[1, null],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + // [[8, 3],[],[4, null, 1, null]] 400 + // [[2, 3, 4],[9, 8]] 500 // slicing the top 2 rows and the bottom row off + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, - LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)}, LCW{LCW{2, 3, 4}, LCW{9, 8}}}); FCW b({100, 200, 300, 400, 500}); LCW expected_a{ - LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + LCW{0, 3}, LCW{5}, LCW({2, null}, valids), LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)}; FCW expected_b{300, 300, 300, 400, 400, 400}; cudf::table_view t({a, b}); From e73fff0196d5b38b539068ea5e5bd8d6a2336afa Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 24 Mar 2021 14:01:25 -0400 Subject: [PATCH 07/14] Misc Python/Cython optimizations (#7686) This PR introduces various small optimizations that should generally improve various common Python overhead. See https://github.com/rapidsai/cudf/pull/7454#issuecomment-804483021 for the motivation behind these optimizations and some benchmarks. Merge after: #7660 Summary: * Adds a way to initialize a ColumnAccessor (_init_unsafe) without validating its input. This is useful when converting a `cudf::table` to a `Frame`, where we're guaranteed the columns are well formed * Improved (faster) `is_numerical_dtype` * Prioritize check for numeric dtypes in `astype()` and `build_column()`. Numeric types are presumably more common, and we can avoid expensive checks for other dtypes this way. Authors: - Ashwin Srinath (@shwina) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7686 --- python/cudf/cudf/_lib/table.pyx | 32 +++++++++++++++--------- python/cudf/cudf/core/buffer.py | 4 +++ python/cudf/cudf/core/column/column.py | 24 ++++++++++-------- python/cudf/cudf/core/column_accessor.py | 23 ++++++++++++----- python/cudf/cudf/core/frame.py | 4 ++- python/cudf/cudf/utils/dtypes.py | 15 +++++------ 6 files changed, 64 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index f97b45d8abf..93d79ba6843 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -99,22 +99,30 @@ cdef class Table: cdef vector[unique_ptr[column]].iterator it = columns.begin() # First construct the index, if any + cdef int i + index = None if index_names is not None: - index_columns = [] - for _ in index_names: - index_columns.append(Column.from_unique_ptr( - move(dereference(it)) - )) - it += 1 - index = Table(dict(zip(index_names, index_columns))) + index_data = ColumnAccessor._create_unsafe( + { + name: Column.from_unique_ptr( + move(dereference(it + i)) + ) + for i, name in enumerate(index_names) + } + ) + index = Table(data=index_data) # Construct the data dict - data_columns = [] - for _ in column_names: - data_columns.append(Column.from_unique_ptr(move(dereference(it)))) - it += 1 - data = dict(zip(column_names, data_columns)) + cdef int n_index_columns = len(index_names) if index_names else 0 + data = ColumnAccessor._create_unsafe( + { + name: Column.from_unique_ptr( + move(dereference(it + i + n_index_columns)) + ) + for i, name in enumerate(column_names) + } + ) return Table(data=data, index=index) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 350346a87f9..9fc5570e35a 100644 --- a/python/cudf/cudf/core/buffer.py +++ 
b/python/cudf/cudf/core/buffer.py @@ -42,6 +42,10 @@ def __init__( self.ptr = data.ptr self.size = data.size self._owner = owner or data._owner + elif isinstance(data, rmm.DeviceBuffer): + self.ptr = data.ptr + self.size = data.size + self._owner = data elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2b2874eeb4..dd06d97d105 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1017,7 +1017,9 @@ def distinct_count( return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: - if is_categorical_dtype(dtype): + if is_numerical_dtype(dtype): + return self.as_numerical_column(dtype) + elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { np.str_, @@ -1548,6 +1550,16 @@ def build_column( """ dtype = pd.api.types.pandas_dtype(dtype) + if is_numerical_dtype(dtype): + assert data is not None + return cudf.core.column.NumericalColumn( + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + ) if is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( @@ -1634,15 +1646,7 @@ def build_column( children=children, ) else: - assert data is not None - return cudf.core.column.NumericalColumn( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) + raise TypeError(f"Unrecognized dtype: {dtype}") def build_categorical_column( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 0c580132290..33bae5c1328 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -19,11 +19,7 @@ import cudf from cudf.core import column -from cudf.utils.utils import ( - cached_property, - to_flat_dict, - to_nested_dict, -) +from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -84,6 +80,21 @@ def __init__( self.multiindex = multiindex self._level_names = level_names + @classmethod + def _create_unsafe( + cls, + data: Dict[Any, ColumnBase], + multiindex: bool = False, + level_names=None, + ) -> ColumnAccessor: + # create a ColumnAccessor without verifying column + # type or size + obj = cls() + obj._data = data + obj.multiindex = multiindex + obj._level_names = level_names + return obj + def __iter__(self): return self._data.__iter__() @@ -167,7 +178,7 @@ def _column_length(self): return 0 def _clear_cache(self): - cached_properties = "columns", "names", "_grouped_data" + cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: try: self.__delattr__(attr) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e6898b8c606..ecff3dee573 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2408,7 +2408,9 @@ def _copy_type_metadata( for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): - self._data[name] = other_col._copy_type_metadata(col) + self._data.set_by_label( + name, other_col._copy_type_metadata(col), validate=False + ) if include_index: if self._index is not None and other._index is not None: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..8875a36dba8 
100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,13 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): + # TODO: we should handle objects with a `.dtype` attribute, + # e.g., arrays, here. + try: + dtype = np.dtype(obj) + except TypeError: return False - if is_list_dtype(obj): - return False - return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) - ) + return dtype.kind in "biuf" def is_string_dtype(obj): From aa7ca46dcd9ebbeef86261b188735e58cfb4c3ca Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 24 Mar 2021 14:58:18 -0500 Subject: [PATCH 08/14] Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark (#7672) #7024 added a Spark variant of Murmur3 hashing, but it is inconsistent with Apache Spark's hash calculations in a few areas: - `-0.0` and `0.0` are not treated the same by Apache Spark for floats and doubles - byte and short integral values are upcast to a 32-bit unsigned int (i.e.: zero-filled) before calculating the hash In addition libcudf allows hashing of timestamp columns but the JNI bindings asserted if timestamp columns were passed in, disabling the ability to hash on timestamps directly. Authors: - Jason Lowe (@jlowe) Approvers: - Nghia Truong (@ttnghia) - Jake Hemstad (@jrhemstad) - Alessandro Bellina (@abellina) - MithunR (@mythrocks) - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7672 --- .../cudf/detail/utilities/hash_functions.cuh | 49 ++++- cpp/tests/hashing/hash_test.cpp | 174 ++++++++++++++---- .../java/ai/rapids/cudf/ColumnVector.java | 1 - .../java/ai/rapids/cudf/ColumnVectorTest.java | 40 ++++ 4 files changed, 226 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index a2e7d6d4aae..31533a69487 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -570,9 +571,7 @@ struct SparkMurmurHash3_32 { template ::value>* = nullptr> hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const { - if (key == T{0.0}) { - return compute(T{0.0}); - } else if (isnan(key)) { + if (isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); return compute(nan); } else { @@ -630,6 +629,48 @@ hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(bool return this->compute(key); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(int8_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(uint8_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(int16_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(uint16_t const& key) const +{ + return this->compute(key); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(numeric::decimal32 const& key) const +{ + return this->compute(key.value()); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(numeric::decimal64 const& key) const +{ + return this->compute(key.value()); +} + /** * @brief Specialization of MurmurHash3_32 operator for strings. */ diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index f0d13390edf..5641d445ff3 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -201,27 +202,37 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) T nan = std::numeric_limits::quiet_NaN(); T inf = std::numeric_limits::infinity(); - fixed_width_column_wrapper const col1({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); - fixed_width_column_wrapper const col2( - {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + fixed_width_column_wrapper const col({T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col_neg_zero( + {T(-0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + fixed_width_column_wrapper const col_neg_nan( + {T(0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + auto const table_col = cudf::table_view({col}); + auto const table_col_neg_zero = cudf::table_view({col_neg_zero}); + auto const table_col_neg_nan = cudf::table_view({col_neg_nan}); - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); + auto const hash_col = cudf::hash(table_col); + auto const hash_col_neg_zero = cudf::hash(table_col_neg_zero); + auto const hash_col_neg_nan = cudf::hash(table_col_neg_nan); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view(), true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, true); - auto const serial_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 0); - auto const serial_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3); + constexpr auto serial_hasher = cudf::hash_id::HASH_SERIAL_MURMUR3; + auto const serial_col = cudf::hash(table_col, serial_hasher, {}, 0); + auto const serial_col_neg_zero = 
cudf::hash(table_col_neg_zero, serial_hasher); + auto const serial_col_neg_nan = cudf::hash(table_col_neg_nan, serial_hasher); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(serial_output1->view(), serial_output2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_zero, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*serial_col, *serial_col_neg_nan, true); - auto const spark_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 0); - auto const spark_output2 = cudf::hash(input2, cudf::hash_id::HASH_SPARK_MURMUR3); + // Spark hash is sensitive to 0 and -0 + constexpr auto spark_hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const spark_col = cudf::hash(table_col, spark_hasher, {}, 0); + auto const spark_col_neg_nan = cudf::hash(table_col_neg_nan, spark_hasher); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan); } class SerialMurmurHash3Test : public cudf::test::BaseFixture { @@ -267,10 +278,59 @@ class SparkMurmurHash3Test : public cudf::test::BaseFixture { TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) { - fixed_width_column_wrapper const strings_col_result( + // The hash values were determined by running the following Scala code in Apache Spark: + // import org.apache.spark.sql.catalyst.util.DateTimeUtils + // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType) + // .add("timestamps",TimestampType).add("decimal64", DecimalType(18,7)).add("longs",LongType) + // .add("floats",FloatType).add("dates",DateType).add("decimal32", DecimalType(9,3)) + // .add("ints",IntegerType).add("shorts",ShortType).add("bytes",ByteType) + // .add("bools",BooleanType) + // val data = Seq( + // Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat, + // DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, false), + // Row("The quick brown fox", -(0.toDouble), DateTimeUtils.toJavaTimestamp(100), + // BigDecimal("0.00001"), 100.toLong, -(0.toFloat), DateTimeUtils.toJavaDate(100), + // BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100), + // BigDecimal("-0.00001"), -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100), + // BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, true), + // Row("All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // val df = spark.createDataFrame(sc.parallelize(data), schema) + // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) + // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); - fixed_width_column_wrapper const ints_col_result( + fixed_width_column_wrapper const hash_doubles_expected( + {-1670924195, -853646085, -1281358385, 1897734433, -508695674}); + 
fixed_width_column_wrapper const hash_timestamps_expected( + {-1670924195, 1114849490, 904948192, -1832979433, 1752430209}); + fixed_width_column_wrapper const hash_decimal64_expected( + {-1670924195, 1114849490, 904948192, 1962370902, -1795328666}); + fixed_width_column_wrapper const hash_longs_expected( + {-1670924195, 1114849490, 904948192, -853646085, -1604625029}); + fixed_width_column_wrapper const hash_floats_expected( + {933211791, 723455942, -349261430, -1225560532, -338752985}); + fixed_width_column_wrapper const hash_dates_expected( + {933211791, 751823303, -1080202046, -1906567553, -1503850410}); + fixed_width_column_wrapper const hash_decimal32_expected( + {-1670924195, 1114849490, 904948192, -1454351396, -193774131}); + fixed_width_column_wrapper const hash_ints_expected( {933211791, 751823303, -1080202046, 723455942, 133916647}); + fixed_width_column_wrapper const hash_shorts_expected( + {933211791, 751823303, -1080202046, -1871935946, 1249274084}); + fixed_width_column_wrapper const hash_bytes_expected( + {933211791, 751823303, -1080202046, 1110053733, 1135925485}); + fixed_width_column_wrapper const hash_bools_expected( + {933211791, -559580957, -559580957, -559580957, 933211791}); + fixed_width_column_wrapper const hash_combined_expected( + {-1947042614, -1731440908, 807283935, 725489209, 822276819}); strings_column_wrapper const strings_col({"", "The quick brown fox", @@ -278,26 +338,74 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - using limits = std::numeric_limits; - fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); - + using double_limits = std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper const doubles_col( + {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); + fixed_width_column_wrapper const timestamps_col( + {0L, 100L, -100L, long_limits::min() / 1000000, long_limits::max() / 1000000}); + fixed_point_column_wrapper const decimal64_col( + {0L, 100L, -100L, -999999999999999999L, 999999999999999999L}, numeric::scale_type{-7}); + fixed_width_column_wrapper const longs_col( + {0L, 100L, -100L, long_limits::min(), long_limits::max()}); + fixed_width_column_wrapper const floats_col( + {0.f, -0.f, -float_limits::quiet_NaN(), float_limits::lowest(), float_limits::max()}); + fixed_width_column_wrapper dates_col( + {0, 100, -100, int_limits::min() / 100, int_limits::max() / 100}); + fixed_point_column_wrapper const decimal32_col({0, 100, -100, -999999999, 999999999}, + numeric::scale_type{-3}); + fixed_width_column_wrapper const ints_col( + {0, 100, -100, int_limits::min(), int_limits::max()}); + fixed_width_column_wrapper const shorts_col({0, 100, -100, -32768, 32767}); + fixed_width_column_wrapper const bytes_col({0, 100, -100, -128, 127}); fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); - auto const input1 = cudf::table_view({strings_col}); - auto const input2 = cudf::table_view({ints_col}); - auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2}); - - auto const hashed_output1 = cudf::hash(input1, cudf::hash_id::HASH_SPARK_MURMUR3, {}, 314); - auto const hashed_output2 = cudf::hash(input2, 
cudf::hash_id::HASH_SPARK_MURMUR3, {}, 42); - auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SPARK_MURMUR3, {}); - auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SPARK_MURMUR3, {}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true); - EXPECT_EQ(input3.num_rows(), hashed_output3->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true); + constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); + auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42); + auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42); + auto const hash_decimal64 = cudf::hash(cudf::table_view({decimal64_col}), hasher, {}, 42); + auto const hash_longs = cudf::hash(cudf::table_view({longs_col}), hasher, {}, 42); + auto const hash_floats = cudf::hash(cudf::table_view({floats_col}), hasher, {}, 42); + auto const hash_dates = cudf::hash(cudf::table_view({dates_col}), hasher, {}, 42); + auto const hash_decimal32 = cudf::hash(cudf::table_view({decimal32_col}), hasher, {}, 42); + auto const hash_ints = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42); + auto const hash_shorts = cudf::hash(cudf::table_view({shorts_col}), hasher, {}, 42); + auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, {}, 42); + auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42); + auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal64, hash_decimal64_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_longs, hash_longs_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_floats, hash_floats_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_dates, hash_dates_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal32, hash_decimal32_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_ints, hash_ints_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_shorts, hash_shorts_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true); + + auto const combined_table = cudf::table_view({strings_col, + doubles_col, + timestamps_col, + decimal64_col, + longs_col, + floats_col, + dates_col, + decimal32_col, + ints_col, + shorts_col, + bytes_col, + bools_col2}); + auto const hash_combined = cudf::hash(combined_table, hasher, {}, 42); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true); } class MD5HashTest : public cudf::test::BaseFixture { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index defb6eea5b9..e6675591164 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -606,7 +606,6 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) 
assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp"; assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; columnViews[i] = columns[i].getNativeView(); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 00d6e51fd91..02fbe56431b 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -527,6 +527,46 @@ void testSpark32BitMurmur3HashDoubles() { } } + @Test + void testSpark32BitMurmur3HashTimestamps() { + try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( + 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 42, 1114849490, 904948192, 657182333, 42, -57193045)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testSpark32BitMurmur3HashDecimal64() { + try (ColumnVector v = ColumnVector.decimalFromLongs(-7, + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 1114849490, 904948192, 657182333, -57193045)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testSpark32BitMurmur3HashDecimal32() { + try (ColumnVector v = ColumnVector.decimalFromInts(-3, + 0, 100, -100, 0x12345678, -0x12345678); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 1114849490, 904948192, -958054811, -1447702630)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testSpark32BitMurmur3HashDates() { + try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( + 0, null, 100, -100, 0x12345678, null, -0x12345678); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedInts(933211791, 42, 751823303, -1080202046, -1721170160, 42, 1852996993)) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testSpark32BitMurmur3HashFloats() { try (ColumnVector v = ColumnVector.fromBoxedFloats( From df3c0f054e3a882eda60d35b9cceb4fbd1c445e6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 24 Mar 2021 15:49:46 -0500 Subject: [PATCH 09/14] Fix return type of `DataFrame.argsort` (#7706) Fixes: #7577 This PR fixes the return type of `DataFrame.argsort` to return `cudf.Series` instead of a `NumericalColumn`. 
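A minimal sketch of the new behavior (the frame values below are illustrative only, mirroring the docstring example added by this change):

```
import cudf

df = cudf.DataFrame({'a': [10, 0, 2], 'b': [-10, 10, 1]})

inds = df.argsort()   # now a cudf.Series of int32 row positions, not a NumericalColumn
assert isinstance(inds, cudf.Series)

df.take(inds)         # the returned indices can be passed straight to take() to sort the frame
```
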
Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7706 --- python/cudf/cudf/core/dataframe.py | 24 +++++++++++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bd009a9ad84..b5f57356698 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3841,10 +3841,32 @@ def argsort(self, ascending=True, na_position="last"): - Support axis='index' only. - Not supporting: inplace, kind - Ascending can be a list of bools to control per column + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]}) + >>> df + a b + 0 10 -10 + 1 0 10 + 2 2 1 + >>> inds = df.argsort() + >>> inds + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> df.take(inds) + a b + 1 0 10 + 2 2 1 + 0 10 -10 """ - return self._get_sorted_inds( + inds_col = self._get_sorted_inds( ascending=ascending, na_position=na_position ) + return cudf.Series(inds_col) @annotate("SORT_INDEX", color="red", domain="cudf_python") def sort_index( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 76a02d5e74a..d72b88f1713 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8495,3 +8495,24 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): got = gdf.explode(label_to_explode, ignore_index) assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "df,ascending,expected", + [ + ( + cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), + True, + cudf.Series([1, 2, 0], dtype="int32"), + ), + ( + cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), + False, + cudf.Series([0, 2, 1], dtype="int32"), + ), + ], +) +def test_dataframe_argsort(df, ascending, expected): + actual = df.argsort(ascending=ascending) + + assert_eq(actual, expected) From 14172979ffa3ad7023f6eae7a311fa132b7ad8d1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 24 Mar 2021 17:13:58 -0500 Subject: [PATCH 10/14] Materialize `RangeIndex` when `index=True` in parquet writer (#7711) Resolves: #6873 This PR enables support to materialize a `RangeIndex` when `index=True`. Didn't add any tests as we already test for this in `test_parquet_index` but we were having the tests wrong due to a typo which was writing to the same file both pandas & cudf dataframes. This test is now fixed in this PR. 
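A rough sketch of the resulting writer behavior (the file names below are hypothetical, purely for illustration):

```
import cudf

gdf = cudf.DataFrame({'a': [1, 2, 3]})   # default RangeIndex

# index=None (default): the RangeIndex is stored as a range in the metadata only.
gdf.to_parquet('range_as_metadata.parquet')

# index=True: the RangeIndex is materialized and written out as index values,
# so reading the file back preserves them explicitly.
gdf.to_parquet('range_materialized.parquet', index=True)
cudf.read_parquet('range_materialized.parquet').index
```
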
Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7711 --- python/cudf/cudf/_lib/parquet.pyx | 4 +++- python/cudf/cudf/_lib/utils.pyx | 32 +++++++++++++++++++------- python/cudf/cudf/tests/test_parquet.py | 25 ++++++++++---------- python/cudf/cudf/utils/ioutils.py | 7 ++++-- 4 files changed, 45 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 0158df46cc4..d8b4fbbbe4b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -294,7 +294,9 @@ cpdef write_parquet( cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink) - if index is not False and not isinstance(table._index, cudf.RangeIndex): + if index is True or ( + index is None and not isinstance(table._index, cudf.RangeIndex) + ): tv = table.view() tbl_meta = make_unique[table_input_metadata](tv) for level, idx_name in enumerate(table._index.names): diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4fe795e57a9..13eedb34c18 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -99,15 +99,31 @@ cpdef generate_pandas_metadata(Table table, index): idx = table.index if isinstance(idx, cudf.core.index.RangeIndex): - descr = { - "kind": "range", - "name": table.index.name, - "start": table.index.start, - "stop": table.index.stop, - "step": table.index.step, - } + if index is None: + descr = { + "kind": "range", + "name": table.index.name, + "start": table.index.start, + "stop": table.index.stop, + "step": table.index.step, + } + else: + # When `index=True`, RangeIndex needs to be materialized. + materialized_idx = cudf.Index(idx._values, name=idx.name) + descr = \ + _index_level_name( + index_name=materialized_idx.name, + level=level, + column_names=col_names + ) + index_levels.append(materialized_idx) else: - descr = _index_level_name(idx.name, level, col_names) + descr = \ + _index_level_name( + index_name=idx.name, + level=level, + column_names=col_names + ) if is_categorical_dtype(idx): raise ValueError( "'category' column dtypes are currently not " diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index a7a11c95e30..fe418d1ade1 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ import datetime import math import os @@ -1718,24 +1719,24 @@ def test_parquet_nullable_boolean(tmpdir, engine): ], ) @pytest.mark.parametrize("index", [None, True, False]) -def test_parquet_index(tmpdir, pdf, index): - pandas_path = tmpdir.join("pandas_index.parquet") - cudf_path = tmpdir.join("pandas_index.parquet") +def test_parquet_index(pdf, index): + pandas_buffer = BytesIO() + cudf_buffer = BytesIO() gdf = cudf.from_pandas(pdf) - pdf.to_parquet(pandas_path, index=index) - gdf.to_parquet(cudf_path, index=index) + pdf.to_parquet(pandas_buffer, index=index) + gdf.to_parquet(cudf_buffer, index=index) - expected = pd.read_parquet(cudf_path) - actual = cudf.read_parquet(cudf_path) + expected = pd.read_parquet(cudf_buffer) + actual = cudf.read_parquet(pandas_buffer) - assert_eq(expected, actual) + assert_eq(expected, actual, check_index_type=True) - expected = pd.read_parquet(pandas_path) - actual = cudf.read_parquet(pandas_path) + expected = pd.read_parquet(pandas_buffer) + actual = cudf.read_parquet(cudf_buffer) - assert_eq(expected, actual) + assert_eq(expected, actual, check_index_type=True) @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5d52d6c7da4..16511627aa2 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. import datetime import os @@ -193,7 +193,10 @@ index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, the - engine's default behavior will be used. + engine's default behavior will be used. However, instead of being saved + as values, the ``RangeIndex`` will be stored as a range in the metadata + so it doesn’t require much space and is faster. Other indexes will + be included as columns in the file output. 
partition_cols : list, optional, default None Column names by which to partition the dataset Columns are partitioned in the order they are given From f38daf384a7cd78b681a3a7e6c854b5faadfc1dc Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Wed, 24 Mar 2021 18:21:53 -0500 Subject: [PATCH 11/14] Implement scan operations for decimal columns (#7707) This adds support for `cummin`, `cummax`, and `cumsum` in cuDF for columns with type `decimal` Authors: - @ChrisJar Approvers: - GALI PREM SAGAR (@galipremsagar) URL: https://github.com/rapidsai/cudf/pull/7707 --- python/cudf/cudf/core/column/decimal.py | 3 ++ python/cudf/cudf/core/series.py | 10 ++++-- python/cudf/cudf/tests/test_scan.py | 46 +++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 7fbe602f07a..4ba675516ae 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -72,6 +72,9 @@ def binary_operator(self, op, other, reflect=False): result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) return result + def _apply_scan_op(self, op: str) -> ColumnBase: + return libcudf.reduce.scan(op, self, True) + def as_decimal_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DecimalColumn": diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9d4643da637..a664c4fb182 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4725,8 +4725,9 @@ def cumsum(self, axis=0, skipna=True, *args, **kwargs): result_col[first_index:] = None # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ + if not is_decimal_dtype(result_col.dtype) and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) ): return Series( result_col.astype(np.int64)._apply_scan_op("sum"), @@ -4774,6 +4775,11 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") + if is_decimal_dtype(self.dtype): + raise NotImplementedError( + "cumprod does not currently support decimal types" + ) + skipna = True if skipna is None else skipna if skipna: diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index dce65947460..f7e8c5a8563 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -6,6 +6,7 @@ import cudf from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq, gen_rand +from cudf.core.dtypes import Decimal64Dtype params_sizes = [0, 1, 2, 5] @@ -61,6 +62,21 @@ def test_cumsum_masked(): assert_eq(got, expected) +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(8, 4), Decimal64Dtype(10, 5), Decimal64Dtype(12, 7)], +) +def test_cumsum_decimal(dtype): + data = ["243.32", "48.245", "-7234.298", np.nan, "-467.2"] + gser = cudf.Series(data).astype(dtype) + pser = pd.Series(data, dtype="float64") + + got = gser.cumsum() + expected = cudf.Series.from_pandas(pser.cumsum()).astype(dtype) + + assert_eq(got, expected) + + @pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cummin(dtype, nelem): if dtype == np.int8: @@ -103,6 +119,21 @@ def test_cummin_masked(): assert_eq(gs.cummin(), expected) +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(8, 4), Decimal64Dtype(11, 6), Decimal64Dtype(14, 7)], 
+) +def test_cummin_decimal(dtype): + data = ["8394.294", np.nan, "-9940.444", np.nan, "-23.928"] + gser = cudf.Series(data).astype(dtype) + pser = pd.Series(data, dtype="float64") + + got = gser.cummin() + expected = cudf.Series.from_pandas(pser.cummin()).astype(dtype) + + assert_eq(got, expected) + + @pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cummax(dtype, nelem): if dtype == np.int8: @@ -145,6 +176,21 @@ def test_cummax_masked(): assert_eq(gs.cummax(), expected) +@pytest.mark.parametrize( + "dtype", + [Decimal64Dtype(8, 4), Decimal64Dtype(11, 6), Decimal64Dtype(14, 7)], +) +def test_cummax_decimal(dtype): + data = [np.nan, "54.203", "8.222", "644.32", "-562.272"] + gser = cudf.Series(data).astype(dtype) + pser = pd.Series(data, dtype="float64") + + got = gser.cummax() + expected = cudf.Series.from_pandas(pser.cummax()).astype(dtype) + + assert_eq(got, expected) + + @pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cumprod(dtype, nelem): if dtype == np.int8: From 31361242612a2f1198f1defb64cd560ee4eecfa8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 24 Mar 2021 18:35:41 -0500 Subject: [PATCH 12/14] Fix comparison between Datetime/Timedelta columns and NULL scalars (#7504) Fixes https://github.com/rapidsai/cudf/issues/6897 Authors: - @brandon-b-miller Approvers: - GALI PREM SAGAR (@galipremsagar) - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/7504 --- python/cudf/cudf/core/column/datetime.py | 2 + python/cudf/cudf/core/column/timedelta.py | 2 + python/cudf/cudf/tests/test_binops.py | 45 +++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a563248f4ab..0bacbe04356 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -178,6 +178,8 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return cudf.Scalar(None, dtype=other.dtype) return cudf.Scalar(other) + elif other is None: + return cudf.Scalar(other, dtype=self.dtype) else: raise TypeError(f"cannot normalize {type(other)}") diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e22b511db01..a39638106bb 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -275,6 +275,8 @@ def normalize_binop_value(self, other) -> BinaryOperand: return cudf.Scalar(other) elif np.isscalar(other): return cudf.Scalar(other) + elif other is None: + return cudf.Scalar(other, dtype=self.dtype) else: raise TypeError(f"cannot normalize {type(other)}") diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 18f2d7e474b..eb8aaaadd51 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1773,6 +1773,51 @@ def decimal_series(input, dtype): utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "dtype", + [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "str", + "datetime64[ns]", + "datetime64[us]", + "datetime64[ms]", + "datetime64[s]", + "timedelta64[ns]", + "timedelta64[us]", + "timedelta64[ms]", + "timedelta64[s]", + ], +) +@pytest.mark.parametrize("null_scalar", [None, cudf.NA, np.datetime64("NaT")]) +@pytest.mark.parametrize("cmpop", _cmpops) +def 
test_column_null_scalar_comparison(dtype, null_scalar, cmpop): + # This test is meant to validate that comparing + # a series of any dtype with a null scalar produces + # a new series where all the elements are . + + if isinstance(null_scalar, np.datetime64): + if np.dtype(dtype).kind not in "mM": + pytest.skip() + null_scalar = null_scalar.astype(dtype) + + dtype = np.dtype(dtype) + + data = [1, 2, 3, 4, 5] + sr = cudf.Series(data, dtype=dtype) + result = cmpop(sr, null_scalar) + + assert result.isnull().all() + + @pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"]) def test_equality_ops_index_mismatch(fn): a = cudf.Series( From b85459814e84ef783389dbaabdea345de9b93513 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 25 Mar 2021 11:28:02 +1100 Subject: [PATCH 13/14] Convert cudf::concatenate APIs to use spans and device_uvector (#7621) Contributes to #7287 This PR replaces `std::vector` with `host_span` in public and detail `cudf::contatenate` functions, and replaces `rmm::device_vector` with `rmm::device_uvector` in the concatenate implementations. It also strengthens the SFINAE restrictions on `cudf::host_span` and `cudf::device_span` so that they cannot be constructed from containers unless the container's value_type is the same as the span's value_type. This PR also - [x] Updates cython. - [x] benchmarks before and after Authors: - Mark Harris (@harrism) Approvers: - Jake Hemstad (@jrhemstad) - Vukasin Milovanovic (@vuule) - Ashwin Srinath (@shwina) URL: https://github.com/rapidsai/cudf/pull/7621 --- cpp/include/cudf/column/column_factories.hpp | 1 - cpp/include/cudf/concatenate.hpp | 18 +- cpp/include/cudf/detail/concatenate.cuh | 9 +- cpp/include/cudf/detail/concatenate.hpp | 11 +- .../cudf/dictionary/detail/concatenate.hpp | 5 +- cpp/include/cudf/lists/detail/concatenate.hpp | 5 +- .../cudf/strings/detail/concatenate.hpp | 5 +- .../cudf/structs/detail/concatenate.hpp | 5 +- cpp/include/cudf/utilities/span.hpp | 35 +++- cpp/src/copying/concatenate.cu | 61 +++---- cpp/src/dictionary/detail/concatenate.cu | 5 +- cpp/src/interop/from_arrow.cpp | 3 +- cpp/src/join/hash_join.cu | 4 +- cpp/src/lists/copying/concatenate.cu | 6 +- cpp/src/replace/replace.cu | 3 +- cpp/src/strings/copying/concatenate.cu | 50 +++-- cpp/src/structs/copying/concatenate.cu | 6 +- cpp/src/structs/utilities.cu | 3 +- cpp/src/structs/utilities.hpp | 5 +- cpp/tests/copying/concatenate_tests.cu | 172 +++++++++++------- cpp/tests/io/orc_test.cpp | 15 +- cpp/tests/io/parquet_test.cpp | 31 ++-- cpp/tests/merge/merge_test.cpp | 2 +- python/cudf/cudf/_lib/cpp/concatenate.pxd | 14 +- .../cudf/_lib/cpp/utilities/host_span.pxd | 8 + 25 files changed, 282 insertions(+), 200 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/utilities/host_span.pxd diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 31196824845..43c2407d629 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -21,7 +21,6 @@ #include #include -#include namespace cudf { /** diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index 8333cf41b77..182cbbdc3ec 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,9 @@ #include #include +#include #include -#include namespace cudf { /** @@ -36,13 +36,13 @@ namespace cudf { * * Returns empty `device_buffer` if the column is not nullable * - * @param views Vector of column views whose bitmask will to be concatenated + * @param views host_span of column views whose bitmask will to be concatenated * @param mr Device memory resource used for allocating the new device_buffer * @return rmm::device_buffer A `device_buffer` containing the bitmasks of all * the column views in the views vector */ rmm::device_buffer concatenate_masks( - std::vector const& views, + host_span views, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -51,14 +51,13 @@ rmm::device_buffer concatenate_masks( * @throws cudf::logic_error * If types of the input columns mismatch * - * @param columns_to_concat The column views to be concatenated into a single - * column + * @param columns_to_concat host_span of column views to be concatenated into a single column * @param mr Device memory resource used to allocate the returned column's device memory. * @return Unique pointer to a single table having all the rows from the * elements of `columns_to_concat` respectively in the same order. */ std::unique_ptr concatenate( - std::vector const& columns_to_concat, + host_span columns_to_concat, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -82,14 +81,13 @@ std::unique_ptr concatenate( * @throws cudf::logic_error * If number of columns mismatch * - * @param tables_to_concat The table views to be concatenated into a single - * table + * @param tables_to_concat host_span of table views to be concatenated into a single table * @param mr Device memory resource used to allocate the returned table's device memory. * @return Unique pointer to a single table having all the rows from the * elements of `tables_to_concat` respectively in the same order. */ std::unique_ptr
concatenate( - std::vector const& tables_to_concat, + host_span tables_to_concat, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate.cuh index a30ad6e853d..5f0399d6172 100644 --- a/cpp/include/cudf/detail/concatenate.cuh +++ b/cpp/include/cudf/detail/concatenate.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -34,8 +35,8 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -void concatenate_masks(rmm::device_vector const& d_views, - rmm::device_vector const& d_offsets, +void concatenate_masks(device_span d_views, + device_span d_offsets, bitmask_type* dest_mask, size_type output_size, rmm::cuda_stream_view stream); @@ -45,7 +46,7 @@ void concatenate_masks(rmm::device_vector const& d_views, * * @param stream CUDA stream used for device memory operations and kernel launches. */ -void concatenate_masks(std::vector const& views, +void concatenate_masks(host_span views, bitmask_type* dest_mask, rmm::cuda_stream_view stream); diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 43eb5203b37..f7f5567cd76 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -27,22 +28,22 @@ namespace cudf { //! Inner interfaces and implementations namespace detail { /** - * @copydoc cudf::concatenate(std::vector const&,rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr concatenate( - std::vector const& columns_to_concat, + host_span columns_to_concat, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::concatenate(std::vector const&,rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
concatenate( - std::vector const& tables_to_concat, + host_span tables_to_concat, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index ae2e0f0ba38..c2fe2dce1fe 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include @@ -36,7 +37,7 @@ namespace detail { * @return New column with concatenated results. */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index f9adc893b8e..30797443c35 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -42,7 +43,7 @@ namespace detail { * @return New column with concatenated results. */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 3e6fc6d67fc..0740039e896 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -41,7 +42,7 @@ namespace detail { * @return New column with concatenated results. */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index ef3da82cfeb..a098703e4b0 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include namespace cudf { namespace structs { @@ -48,7 +49,7 @@ namespace detail { * @return New column with concatenated results. 
*/ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index c13e5ce44ae..999306d4ee7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -126,16 +126,31 @@ struct host_span : public cudf::detail::span_base::value>* = nullptr> + // Constructor from container + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr host_span(C& in) : base(in.data(), in.size()) { } - template ::value>* = nullptr> + // Constructor from const container + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr host_span(C const& in) : base(in.data(), in.size()) { } + // Copy construction to support const conversion template ::value>* = nullptr> + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr device_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } - template ::value>* = nullptr> + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 8cf9db465f3..1b948083982 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include -#include #include #include @@ -50,19 +50,18 @@ constexpr bool use_fused_kernel_heuristic(bool const has_nulls, size_t const num return has_nulls || num_columns > 4; } -auto create_device_views(std::vector const& views, rmm::cuda_stream_view stream) +auto create_device_views(host_span views, rmm::cuda_stream_view stream) { // Create device views for each input view using CDViewPtr = decltype( column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); - std::transform( - views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { - // TODO creating this device view can invoke null count computation - // even though it isn't used. See this issue: - // https://github.com/rapidsai/cudf/issues/4368 - return column_device_view::create(col, stream); - }); + std::transform(views.begin(), views.end(), device_view_owners.begin(), [stream](auto const& col) { + // TODO creating this device view can invoke null count computation + // even though it isn't used. See this issue: + // https://github.com/rapidsai/cudf/issues/4368 + return column_device_view::create(col, stream); + }); // Assemble contiguous array of device views auto device_views = thrust::host_vector(); @@ -74,7 +73,7 @@ auto create_device_views(std::vector const& views, rmm::cuda_stream // TODO each of these device vector copies invoke stream synchronization // which appears to add unnecessary overhead. 
See this issue: // https://github.com/rapidsai/rmm/issues/120 - auto d_views = rmm::device_vector{device_views}; + auto d_views = make_device_uvector_async(device_views); // Compute the partition offsets auto offsets = thrust::host_vector(views.size() + 1); @@ -85,7 +84,7 @@ auto create_device_views(std::vector const& views, rmm::cuda_stream std::next(offsets.begin()), [](auto const& col) { return col.size(); }, thrust::plus{}); - auto const d_offsets = rmm::device_vector{offsets}; + auto d_offsets = make_device_uvector_async(offsets); auto const output_size = offsets.back(); return std::make_tuple( @@ -132,8 +131,8 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, } } -void concatenate_masks(rmm::device_vector const& d_views, - rmm::device_vector const& d_offsets, +void concatenate_masks(device_span d_views, + device_span d_offsets, bitmask_type* dest_mask, size_type output_size, rmm::cuda_stream_view stream) @@ -141,14 +140,14 @@ void concatenate_masks(rmm::device_vector const& d_views, constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel<<>>( - d_views.data().get(), - d_offsets.data().get(), + d_views.data(), + d_offsets.data(), static_cast(d_views.size()), dest_mask, output_size); } -void concatenate_masks(std::vector const& views, +void concatenate_masks(host_span views, bitmask_type* dest_mask, rmm::cuda_stream_view stream) { @@ -214,7 +213,7 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, } template -std::unique_ptr fused_concatenate(std::vector const& views, +std::unique_ptr fused_concatenate(host_span views, bool const has_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -245,8 +244,8 @@ std::unique_ptr fused_concatenate(std::vector const& views, auto const kernel = has_nulls ? 
fused_concatenate_kernel : fused_concatenate_kernel; kernel<<>>( - d_views.data().get(), - d_offsets.data().get(), + d_views.data(), + d_offsets.data(), static_cast(d_views.size()), *d_out_view, d_valid_count.data()); @@ -257,7 +256,7 @@ std::unique_ptr fused_concatenate(std::vector const& views, } template -std::unique_ptr for_each_concatenate(std::vector const& views, +std::unique_ptr for_each_concatenate(host_span views, bool const has_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -289,7 +288,7 @@ std::unique_ptr for_each_concatenate(std::vector const& vie } struct concatenate_dispatch { - std::vector const& views; + host_span views; rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; @@ -298,7 +297,7 @@ struct concatenate_dispatch { std::unique_ptr operator()() { bool const has_nulls = - std::any_of(views.cbegin(), views.cend(), [](auto const& col) { return col.has_nulls(); }); + std::any_of(views.begin(), views.end(), [](auto const& col) { return col.has_nulls(); }); // Use a heuristic to guess when the fused kernel will be faster if (use_fused_kernel_heuristic(has_nulls, views.size())) { @@ -392,7 +391,7 @@ void bounds_and_type_check(ColIter begin, ColIter end) } // anonymous namespace // Concatenates the elements from a vector of column_views -std::unique_ptr concatenate(std::vector const& columns_to_concat, +std::unique_ptr concatenate(host_span columns_to_concat, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -411,15 +410,15 @@ std::unique_ptr concatenate(std::vector const& columns_to_c columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr}); } -std::unique_ptr
concatenate(std::vector const& tables_to_concat, +std::unique_ptr
concatenate(host_span tables_to_concat, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (tables_to_concat.empty()) { return std::make_unique
(); } table_view const first_table = tables_to_concat.front(); - CUDF_EXPECTS(std::all_of(tables_to_concat.cbegin(), - tables_to_concat.cend(), + CUDF_EXPECTS(std::all_of(tables_to_concat.begin(), + tables_to_concat.end(), [&first_table](auto const& t) { return t.num_columns() == first_table.num_columns(); }), @@ -428,8 +427,8 @@ std::unique_ptr
concatenate(std::vector const& tables_to_conc std::vector> concat_columns; for (size_type i = 0; i < first_table.num_columns(); ++i) { std::vector cols; - std::transform(tables_to_concat.cbegin(), - tables_to_concat.cend(), + std::transform(tables_to_concat.begin(), + tables_to_concat.end(), std::back_inserter(cols), [i](auto const& t) { return t.column(i); }); @@ -442,7 +441,7 @@ std::unique_ptr
concatenate(std::vector const& tables_to_conc } // namespace detail -rmm::device_buffer concatenate_masks(std::vector const& views, +rmm::device_buffer concatenate_masks(host_span views, rmm::mr::device_memory_resource* mr) { bool const has_nulls = @@ -465,14 +464,14 @@ rmm::device_buffer concatenate_masks(std::vector const& views, } // Concatenates the elements from a vector of column_views -std::unique_ptr concatenate(std::vector const& columns_to_concat, +std::unique_ptr concatenate(host_span columns_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::concatenate(columns_to_concat, rmm::cuda_stream_default, mr); } -std::unique_ptr
concatenate(std::vector<table_view> const& tables_to_concat, +std::unique_ptr<table>
concatenate(host_span tables_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 05349a5f968..cdf086e3f4a 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -62,8 +62,7 @@ struct compute_children_offsets_fn { * * @param columns The input dictionary columns. */ - compute_children_offsets_fn(std::vector const& columns) - : columns_ptrs{columns.size()} + compute_children_offsets_fn(host_span columns) : columns_ptrs{columns.size()} { std::transform( columns.begin(), columns.end(), columns_ptrs.begin(), [](auto& cv) { return &cv; }); @@ -187,7 +186,7 @@ struct dispatch_compute_indices { } // namespace -std::unique_ptr concatenate(std::vector const& columns, +std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 729b98d85a8..612e2111b66 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -150,8 +150,7 @@ struct dispatch_to_cudf_column { std::unique_ptr get_empty_type_column(size_type size) { - return std::make_unique( - data_type(type_id::EMPTY), size, std::move(rmm::device_buffer(0))); + return std::make_unique(data_type(type_id::EMPTY), size, rmm::device_buffer(0)); } /** diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b64e91c18bd..d827d03a6c0 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -442,7 +442,9 @@ std::pair, std::unique_ptr
> construct_join_output_ stream, rmm::mr::get_current_device_resource()); common_table = cudf::detail::concatenate( - {common_from_build->view(), common_from_probe->view()}, stream, mr); + std::vector({common_from_build->view(), common_from_probe->view()}), + stream, + mr); } joined_indices = concatenate_vector_pairs(complement_indices, joined_indices); } else { diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index c6ca56085c8..facf2827f56 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -48,7 +48,7 @@ namespace { * @param[in] mr Device memory resource used to allocate the * returned column's device memory. */ -std::unique_ptr merge_offsets(std::vector const& columns, +std::unique_ptr merge_offsets(host_span columns, size_type total_list_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -90,7 +90,7 @@ std::unique_ptr merge_offsets(std::vector const& colu * @copydoc cudf::lists::detail::concatenate */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -119,7 +119,7 @@ std::unique_ptr concatenate( // if any of the input columns have nulls, construct the output mask bool const has_nulls = - std::any_of(columns.cbegin(), columns.cend(), [](auto const& col) { return col.has_nulls(); }); + std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); rmm::device_buffer null_mask = create_null_mask( total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); if (has_nulls) { diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 783e0b4b1de..cb142c2c1e2 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -450,7 +450,8 @@ std::unique_ptr replace_kernel_forwarder::operator()({values.keys(), replacements.keys()}), stream); return cudf::dictionary::detail::add_keys(input, new_keys->view(), stream, mr); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 65c6c8f2836..48358cb4a38 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include +#include "thrust/iterator/transform_iterator.h" #include #include @@ -65,8 +67,7 @@ struct chars_size_transform { } }; -auto create_strings_device_views(std::vector const& views, - rmm::cuda_stream_view stream) +auto create_strings_device_views(host_span views, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); // Assemble contiguous array of device views @@ -77,33 +78,30 @@ auto create_strings_device_views(std::vector const& views, // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = thrust::host_vector(views.size() + 1); + auto input_offsets = std::vector(views.size() + 1); auto offset_it = std::next(input_offsets.begin()); thrust::transform( - thrust::host, views.cbegin(), views.cend(), offset_it, [](auto const& col) -> size_t { + thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { return static_cast(col.size()); }); thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it); - auto const d_input_offsets = rmm::device_vector{input_offsets}; - auto const output_size = input_offsets.back(); + auto d_input_offsets = cudf::detail::make_device_uvector_async(input_offsets, stream); + auto const output_size = input_offsets.back(); // Compute the partition offsets and size of chars column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - // Note: Using separate transform and inclusive_scan because - // transform_inclusive_scan fails to compile with: - // error: the default constructor of "cudf::column_device_view" cannot be - // referenced -- it is a deleted function - auto d_partition_offsets = rmm::device_vector(views.size() + 1); - thrust::transform(rmm::exec_policy(stream), - device_views_ptr, - device_views_ptr + views.size(), - std::next(d_partition_offsets.begin()), - chars_size_transform{}); - thrust::inclusive_scan(rmm::exec_policy(stream), - d_partition_offsets.cbegin(), - d_partition_offsets.cend(), - d_partition_offsets.begin()); - auto const output_chars_size = d_partition_offsets.back(); + auto d_partition_offsets = rmm::device_uvector(views.size() + 1, stream); + size_t zero{0}; + d_partition_offsets.set_element_async(0, zero, stream); // zero first element + + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + device_views_ptr, + device_views_ptr + views.size(), + std::next(d_partition_offsets.begin()), + chars_size_transform{}, + thrust::plus{}); + auto const output_chars_size = d_partition_offsets.back_element(stream); + stream.synchronize(); // ensure copy of output_chars_size is complete before returning return std::make_tuple(std::move(device_view_owners), device_views_ptr, @@ -205,7 +203,7 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* } } -std::unique_ptr concatenate(std::vector const& columns, +std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -257,8 +255,8 @@ std::unique_ptr concatenate(std::vector const& columns, : fused_concatenate_string_offset_kernel; kernel<<>>( d_views, - d_input_offsets.data().get(), - d_partition_offsets.data().get(), + d_input_offsets.data(), + d_partition_offsets.data(), static_cast(columns.size()), strings_count, d_new_offsets, @@ -277,7 +275,7 @@ std::unique_ptr concatenate(std::vector const& columns, auto const kernel = 
fused_concatenate_string_chars_kernel; kernel<<>>( d_views, - d_partition_offsets.data().get(), + d_partition_offsets.data(), static_cast(columns.size()), total_bytes, d_new_chars); diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index b2f861c7c8d..6f18c4bcbd4 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -36,7 +36,7 @@ namespace detail { /** * @copydoc cudf::structs::detail::concatenate */ -std::unique_ptr concatenate(std::vector const& columns, +std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -49,7 +49,7 @@ std::unique_ptr concatenate(std::vector const& columns, std::transform(ordered_children.begin(), ordered_children.end(), std::back_inserter(children), - [mr, stream](std::vector const& cols) { + [mr, stream](host_span cols) { return cudf::detail::concatenate(cols, stream, mr); }); @@ -57,7 +57,7 @@ std::unique_ptr concatenate(std::vector const& columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = - std::any_of(columns.cbegin(), columns.cend(), [](auto const& col) { return col.has_nulls(); }); + std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); rmm::device_buffer null_mask = create_null_mask(total_length, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); if (has_nulls) { diff --git a/cpp/src/structs/utilities.cu b/cpp/src/structs/utilities.cu index 09e6c5d949d..274a88d3a05 100644 --- a/cpp/src/structs/utilities.cu +++ b/cpp/src/structs/utilities.cu @@ -18,6 +18,7 @@ #include #include +#include namespace cudf { namespace structs { @@ -27,7 +28,7 @@ namespace detail { * @copydoc cudf::structs::detail::extract_ordered_struct_children */ std::vector> extract_ordered_struct_children( - std::vector const& struct_cols) + host_span struct_cols) { auto const num_children = struct_cols[0].num_children(); auto const num_cols = static_cast(struct_cols.size()); diff --git a/cpp/src/structs/utilities.hpp b/cpp/src/structs/utilities.hpp index 1e0511cfd83..613754fc765 100644 --- a/cpp/src/structs/utilities.hpp +++ b/cpp/src/structs/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include namespace cudf { namespace structs { @@ -45,7 +46,7 @@ namespace detail { * @return New column with concatenated results. 
*/ std::vector> extract_ordered_struct_children( - std::vector const& struct_cols); + host_span struct_cols); } // namespace detail } // namespace structs diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index e63cbac1e72..cea53326895 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -99,7 +99,7 @@ TYPED_TEST(TypedColumnTest, ConcatenateNoColumns) TYPED_TEST(TypedColumnTest, ConcatenateColumnView) { - cudf::column original{this->type(), this->num_elements(), this->data, this->mask}; + column original{this->type(), this->num_elements(), this->data, this->mask}; std::vector indices{0, this->num_elements() / 3, this->num_elements() / 3, @@ -223,7 +223,7 @@ TEST_F(TableTest, ConcatenateTables) cols_table2.push_back(col3_table2.release()); Table t2(std::move(cols_table2)); - auto concat_table = cudf::concatenate({t1.view(), t2.view()}); + auto concat_table = cudf::concatenate(std::vector({t1, t2})); CUDF_TEST_EXPECT_TABLES_EQUAL(*concat_table, gold_table); } @@ -341,7 +341,8 @@ TEST_F(TableTest, SizeOverflowTest) auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); cudf::table_view tbl({*many_chars}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), + cudf::logic_error); } // string column, overflow on chars @@ -356,7 +357,8 @@ TEST_F(TableTest, SizeOverflowTest) 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), + cudf::logic_error); } // string column, overflow on offsets (rows) @@ -372,7 +374,8 @@ TEST_F(TableTest, SizeOverflowTest) size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), + cudf::logic_error); } // list, structs too long @@ -395,8 +398,8 @@ TEST_F(TableTest, SizeOverflowTest) 1, offsets.release(), std::move(struct_col), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}), - cudf::logic_error); + auto tables = std::vector({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}); + EXPECT_THROW(cudf::concatenate(tables), cudf::logic_error); } // struct, list child too long @@ -419,8 +422,8 @@ TEST_F(TableTest, SizeOverflowTest) auto col = cudf::make_structs_column(size, std::move(children), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}), - cudf::logic_error); + auto tables = std::vector({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}); + EXPECT_THROW(cudf::concatenate(tables), cudf::logic_error); } } @@ -463,12 +466,14 @@ TEST_F(StructsColumnTest, ConcatenateStructs) // build expected output std::vector> expected_children; - expected_children.push_back( - cudf::concatenate({name_cols[0], name_cols[1], name_cols[2], name_cols[3]})); - expected_children.push_back( - cudf::concatenate({age_cols[0], age_cols[1], age_cols[2], age_cols[3]})); - expected_children.push_back( - 
cudf::concatenate({is_human_cols[0], is_human_cols[1], is_human_cols[2], is_human_cols[3]})); + auto name_col_vec = + std::vector({name_cols[0], name_cols[1], name_cols[2], name_cols[3]}); + auto age_col_vec = std::vector({age_cols[0], age_cols[1], age_cols[2], age_cols[3]}); + auto is_human_col_vec = std::vector( + {is_human_cols[0], is_human_cols[1], is_human_cols[2], is_human_cols[3]}); + expected_children.push_back(cudf::concatenate(name_col_vec)); + expected_children.push_back(cudf::concatenate(age_col_vec)); + expected_children.push_back(cudf::concatenate(is_human_col_vec)); std::vector struct_validity({1, 0, 1, 1, 1, 0}); auto expected = make_structs_column( 6, @@ -484,7 +489,7 @@ TEST_F(StructsColumnTest, ConcatenateStructs) src.push_back(structs_column_wrapper({name_cols[3], age_cols[3], is_human_cols[3]}, {1, 0})); // concatenate - auto result = cudf::concatenate({src[0], src[1], src[2], src[3]}); + auto result = cudf::concatenate(std::vector({src[0], src[1], src[2], src[3]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -536,9 +541,13 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs) // build expected output std::vector> expected_children; - expected_children.push_back(cudf::concatenate({split_names_cols[0], split_names_cols[1]})); - expected_children.push_back(cudf::concatenate({split_ages_cols[0], split_ages_cols[1]})); - expected_children.push_back(cudf::concatenate({split_is_human_cols[0], split_is_human_cols[1]})); + auto expected_names = std::vector({split_names_cols[0], split_names_cols[1]}); + auto expected_ages = std::vector({split_ages_cols[0], split_ages_cols[1]}); + auto expected_is_human = + std::vector({split_is_human_cols[0], split_is_human_cols[1]}); + expected_children.push_back(cudf::concatenate(expected_names)); + expected_children.push_back(cudf::concatenate(expected_ages)); + expected_children.push_back(cudf::concatenate(expected_is_human)); auto expected = make_structs_column(7, std::move(expected_children), 0, rmm::device_buffer{}); // concatenate as structs @@ -552,7 +561,8 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs) } // concatenate - auto result = cudf::concatenate({src[0], src[1]}); + + auto result = cudf::concatenate(std::vector({src[0], src[1]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -607,8 +617,11 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested) // build expected output std::vector> expected_children; - expected_children.push_back(cudf::concatenate({inner_structs[0], inner_structs[1]})); - expected_children.push_back(cudf::concatenate({inner_lists[0], inner_lists[1]})); + + expected_children.push_back( + cudf::concatenate(std::vector({inner_structs[0], inner_structs[1]}))); + expected_children.push_back( + cudf::concatenate(std::vector({inner_lists[0], inner_lists[1]}))); auto expected = make_structs_column(11, std::move(expected_children), 0, rmm::device_buffer{}); // concatenate as structs @@ -621,7 +634,7 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested) } // concatenate - auto result = cudf::concatenate({src[0], src[1]}); + auto result = cudf::concatenate(std::vector({src[0], src[1]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -635,7 +648,7 @@ TEST_F(ListsColumnTest, ConcatenateLists) cudf::test::lists_column_wrapper b{4, 5, 6, 7, 8, 9, 10}; cudf::test::lists_column_wrapper expected{{0, 1, 2, 3}, {4, 5, 6, 7, 8, 9, 10}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -646,7 +659,7 @@ TEST_F(ListsColumnTest, ConcatenateLists) cudf::test::lists_column_wrapper expected{ {0, 1, 1}, {2, 3}, {4, 5}, {6}, {8, 9, 9, 9}, {10, 11}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -657,7 +670,7 @@ TEST_F(ListsColumnTest, ConcatenateLists) cudf::test::lists_column_wrapper expected{ {0, 1}, {2, 3, 4, 5}, {6, 7, 8}, {9}, {10, 11}, {12, 13, 14, 15}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -674,7 +687,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper b{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{4, 5, 6, 7}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -684,7 +697,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper d{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{4, 5, 6, 7}; - auto result = cudf::concatenate({a, b, c, d}); + auto result = cudf::concatenate(std::vector({a, b, c, d})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -694,7 +707,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper b{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{LCW{}, {4, 5, 6, 7}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -704,7 +717,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper d{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{LCW{}, LCW{}, LCW{}, {4, 5, 6, 7}}; - auto result = cudf::concatenate({a, b, c, d}); + auto result = cudf::concatenate(std::vector({a, b, c, d})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -715,7 +728,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper d{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{{1, 2}, LCW{}, LCW{}, {4, 5, 6, 7}}; - auto result = cudf::concatenate({a, b, c, d}); + auto result = cudf::concatenate(std::vector({a, b, c, d})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -732,7 +745,7 @@ TEST_F(ListsColumnTest, ConcatenateListsWithNulls) cudf::test::lists_column_wrapper b{{{4, 6, 7}, valids}}; cudf::test::lists_column_wrapper expected{{{0, 1, 2, 3}, valids}, {{4, 6, 7}, valids}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -746,7 +759,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedLists) cudf::test::lists_column_wrapper expected{ {{0, 1}, {2}}, {{4, 5, 6, 7, 8, 9, 10}}, {{6, 7}}, {{8, 9, 10}, {11, 12}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -770,7 +783,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedLists) {{{31, 32}, {33, 34}}, {{35, 36}, {37, 38}}, {{39, 40}}}, {{{71, 72}, {74}}, {{75, 76, 77, 78}, {77, 78}}, {{79, 80, 81}}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -789,7 +802,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedEmptyLists) cudf::test::lists_column_wrapper 
expected{ {{LCW{}}}, {{0, 1}, {2, 3}}, {{6, 7}}, {LCW{}, {11, 12}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -815,7 +828,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedEmptyLists) {{{31, 32}, {33, 34}}, {{35, 36}, {37, 38}, {1, 2}}, {{39, 40}}}, {{{LCW{}}}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -834,7 +847,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedListsWithNulls) cudf::test::lists_column_wrapper expected{{{{0, 1}, {2, 3}}, valids}, {{{4}, {6, 7}}, valids}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -848,7 +861,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedListsWithNulls) {{6, 7}}, {{{{8, 9, 10}, valids}, {11, 12}}, valids}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -864,7 +877,8 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies) cudf::test::lists_column_wrapper a{{{{LCW{}}}}}; cudf::test::lists_column_wrapper b{{{LCW{}}}}; cudf::test::lists_column_wrapper c{{LCW{}}}; - EXPECT_THROW(cudf::concatenate({a, b, c}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b, c})), cudf::logic_error); } { @@ -872,20 +886,23 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies) cudf::test::lists_column_wrapper a{{{{{LCW{}}}}, valids.begin()}}; cudf::test::lists_column_wrapper b{{{LCW{}}}}; cudf::test::lists_column_wrapper c{{LCW{}}}; - EXPECT_THROW(cudf::concatenate({a, b, c}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b, c})), cudf::logic_error); } { cudf::test::lists_column_wrapper a{{{{LCW{}}}}}; cudf::test::lists_column_wrapper b{1, 2, 3}; cudf::test::lists_column_wrapper c{{3, 4, 5}}; - EXPECT_THROW(cudf::concatenate({a, b, c}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b, c})), cudf::logic_error); } { cudf::test::lists_column_wrapper a{{{1, 2, 3}}}; cudf::test::lists_column_wrapper b{{4, 5}}; - EXPECT_THROW(cudf::concatenate({a, b}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b})), cudf::logic_error); } } @@ -910,14 +927,16 @@ TEST_F(ListsColumnTest, SlicedColumns) {{4, 4, 4}, {5, 5}, {6, 6}}, {{-1, -1, -1, -1}, {-2}}, {{-3, -3, -3, -3}, {-4}}}; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{{{1, 1, 1}, {2, 2}, {3, 3}}, {{4, 4, 4}, {5, 5}, {6, 6}}, {{-5, -5, -5, -5}, {-6}}, {{-7, -7, -7, -7}, {-8}}}; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -926,14 +945,16 @@ TEST_F(ListsColumnTest, SlicedColumns) {{-1, -1, -1, -1}, {-2}}, {{-3, -3, -3, -3}, {-4}}, }; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{{{7, 7, 7}, {8, 8}, {9, 9}}, {{10, 10, 
10}, {11, 11}, {12, 12}}, {{-5, -5, -5, -5}, {-6}}, {{-7, -7, -7, -7}, {-8}}}; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } @@ -958,7 +979,9 @@ TEST_F(ListsColumnTest, SlicedColumns) {LCW{}, {LCW{}}, {{6, 6}, {2}}}, {{LCW{}}}, {LCW{}, {LCW{}}}}; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); + cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{ @@ -967,7 +990,8 @@ TEST_F(ListsColumnTest, SlicedColumns) {{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, {{LCW{}}}, }; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -975,7 +999,8 @@ TEST_F(ListsColumnTest, SlicedColumns) {LCW{}, LCW{}, {{10, 10, 10}, {11, 11}, {12, 12}}, LCW{}}, {{LCW{}}}, {LCW{}, {LCW{}}}}; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{ @@ -984,7 +1009,8 @@ TEST_F(ListsColumnTest, SlicedColumns) {{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, {{LCW{}}}, }; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } } @@ -1015,14 +1041,16 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{{{-1, -1, -1, -1}, valids}, {-2}}, valids}, {{{{-3, -3, -3, -3}, valids}, {-4}}, valids}, {{{{-5, -5, -5, -5}, valids}, {-6}}, valids}}; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{{{{1, 1, 1}, valids}, {2, 2}, {{3, 3}, valids}}, {{{4, 4, 4}, {{5, 5}, valids}, {6, 6}}, valids}, {{7, 7, 7}, {8, 8}, {9, 9}}, {{{{-7, -7, -7, -7}, valids}, {-8}}, valids}}; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -1030,13 +1058,15 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{{{-1, -1, -1, -1}, valids}, {-2}}, valids}, {{{{-3, -3, -3, -3}, valids}, {-4}}, valids}, {{{{-5, -5, -5, -5}, valids}, {-6}}, valids}}; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{ {{{10, 10, 10}, {11, 11}, {{12, 12}, valids}}, valids}, {{{{-7, -7, -7, -7}, valids}, {-8}}, valids}}; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } @@ -1068,7 +1098,8 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{LCW{}, {{LCW{}}, valids}}, valids}, {{{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, valids}, }; - auto 
result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{ @@ -1079,7 +1110,8 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{{LCW{}, LCW{}}, valids}}, {{LCW{}}}, }; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -1088,14 +1120,16 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{LCW{}, {{LCW{}}, valids}}, valids}, {{{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, valids}, }; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{ {LCW{}, LCW{}, {{{10, 10, 10}, {{11, 11}, valids}, {12, 12}}, valids}, LCW{}}, {{LCW{}}}, }; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } } @@ -1140,11 +1174,12 @@ TEST_F(ListsColumnTest, ListOfStructs) } // build expected output - auto expected_child = - cudf::concatenate({inner_structs[0], inner_structs[1], inner_structs[2], inner_structs[3]}); + auto struct_views = std::vector( + {inner_structs[0], inner_structs[1], inner_structs[2], inner_structs[3]}); + auto expected_child = cudf::concatenate(struct_views); fixed_width_column_wrapper offsets_w{0, 1, 1, 1, 1, 4, 6, 6, 6, 10, 11}; - auto expected = make_lists_column( - 10, std::move(offsets_w.release()), std::move(expected_child), 0, rmm::device_buffer{}); + auto expected = + make_lists_column(10, offsets_w.release(), std::move(expected_child), 0, rmm::device_buffer{}); // lists std::vector> offsets; @@ -1154,7 +1189,7 @@ TEST_F(ListsColumnTest, ListOfStructs) offsets.push_back({0, 0, 4, 5}); // concatenate as lists - std::vector> src; + std::vector> src; for (size_t idx = 0; idx < inner_structs.size(); idx++) { int size = static_cast(offsets[idx]).size() - 1; src.push_back(make_lists_column( @@ -1162,7 +1197,7 @@ TEST_F(ListsColumnTest, ListOfStructs) } // concatenate - auto result = cudf::concatenate({*src[0], *src[1], *src[2], *src[3]}); + auto result = cudf::concatenate(std::vector({*src[0], *src[1], *src[2], *src[3]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -1189,8 +1224,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointConcatentate) auto const b = fw_wrapper(vec.begin() + 300, vec.begin() + 700); auto const c = fw_wrapper(vec.begin() + 700, vec.end()); - auto const columns = std::vector{a, b, c}; - auto const results = cudf::concatenate(columns); + auto const results = cudf::concatenate(std::vector{a, b, c}); auto const expected = fw_wrapper(vec.begin(), vec.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -1208,8 +1242,7 @@ TEST_F(FixedPointTest, FixedPointConcatentate) auto const b = fp_wrapper(vec.begin() + 300, vec.begin() + 700, scale_type{-2}); auto const c = fp_wrapper(vec.begin() + 700, vec.end(), /*****/ scale_type{-2}); - auto const columns = std::vector{a, b, c}; - auto const results = cudf::concatenate(columns); + auto const results = cudf::concatenate(std::vector{a, b, c}); auto const expected = fp_wrapper(vec.begin(), vec.end(), 
scale_type{-2}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -1227,8 +1260,7 @@ TEST_F(FixedPointTest, FixedPointScaleMismatch) auto const b = fp_wrapper(vec.begin() + 300, vec.begin() + 700, scale_type{-2}); auto const c = fp_wrapper(vec.begin() + 700, vec.end(), /*****/ scale_type{-3}); - auto const columns = std::vector{a, b, c}; - EXPECT_THROW(cudf::concatenate(columns), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector{a, b, c}), cudf::logic_error); } struct DictionaryConcatTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b0dc01ea001..108befa80a7 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -395,7 +396,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 || i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); auto col5_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); @@ -657,7 +658,7 @@ TEST_F(OrcChunkedWriterTest, SimpleTable) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedSimple.orc"); cudf_io::chunked_orc_writer_options opts = @@ -677,7 +678,7 @@ TEST_F(OrcChunkedWriterTest, LargeTables) auto table1 = create_random_fixed_table(512, 4096, true); auto table2 = create_random_fixed_table(512, 8192, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedLarge.orc"); cudf_io::chunked_orc_writer_options opts = @@ -737,7 +738,7 @@ TEST_F(OrcChunkedWriterTest, Strings) cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedStrings.orc"); cudf_io::chunked_orc_writer_options opts = @@ -799,7 +800,7 @@ TEST_F(OrcChunkedWriterTest, ReadStripes) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table2, *table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedStripes.orc"); cudf_io::chunked_orc_writer_options opts = @@ -863,7 +864,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.orc"); cudf_io::chunked_orc_writer_options opts = @@ -910,7 +911,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = 
temp_env->get_temp_filepath("ChunkedUnalignedSize2.orc"); cudf_io::chunked_orc_writer_options opts = diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 013457d8ed6..880f11aaeb2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -472,7 +473,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 || i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); auto col5_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); auto col6_mask = @@ -1218,7 +1219,7 @@ TEST_F(ParquetChunkedWriterTest, SimpleTable) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedSimple.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1238,7 +1239,7 @@ TEST_F(ParquetChunkedWriterTest, LargeTables) auto table1 = create_random_fixed_table(512, 4096, true); auto table2 = create_random_fixed_table(512, 8192, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedLarge.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1300,7 +1301,7 @@ TEST_F(ParquetChunkedWriterTest, Strings) cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedStrings.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1359,7 +1360,7 @@ TEST_F(ParquetChunkedWriterTest, ListColumn) auto tbl0 = table_view({col0_tbl0, col1_tbl0, col2_tbl0}); auto tbl1 = table_view({col0_tbl1, col1_tbl1, col2_tbl1}); - auto expected = cudf::concatenate({tbl0, tbl1}); + auto expected = cudf::concatenate(std::vector({tbl0, tbl1})); auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1413,7 +1414,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) auto table_2 = table_view({*list_col_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("family"); @@ -1504,7 +1505,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) auto table_2 = table_view({*list_col_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("family"); @@ -1639,7 +1640,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullability) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, false); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, 
*table2})); auto filepath = temp_env->get_temp_filepath("ChunkedNullable.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1678,7 +1679,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; auto table_2 = cudf::table_view({struct_2_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("being"); @@ -1707,7 +1708,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullability) auto table1 = create_random_fixed_table(5, 5, false); auto table2 = create_random_fixed_table(5, 5, false); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedNoNullable.parquet"); @@ -1764,7 +1765,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) auto table1 = table_view({col00, col10}); auto table2 = table_view({col01, col11}); - auto full_table = cudf::concatenate({table1, table2}); + auto full_table = cudf::concatenate(std::vector({table1, table2})); cudf_io::table_input_metadata metadata(table1); metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level @@ -1809,7 +1810,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; auto table_2 = cudf::table_view({struct_2_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("being").set_nullability(false); @@ -1838,7 +1839,7 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroups) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table2, *table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedRowGroups.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1951,7 +1952,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1998,7 +1999,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.parquet"); cudf_io::chunked_parquet_writer_options args = diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 451fa82d5a3..b7d98704aff 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -705,7 +705,7 @@ TEST_F(MergeTest, KeysWithNulls) auto valids2 = cudf::detail::make_counting_transform_iterator( 0, [](auto row) { return (row % 15 == 0) ? 
false : true; }); cudf::test::fixed_width_column_wrapper data2(data_iter, data_iter + nrows, valids2); - auto all_data = cudf::concatenate({data1, data2}); + auto all_data = cudf::concatenate(std::vector{{data1, data2}}); std::vector column_orders{cudf::order::ASCENDING, cudf::order::DESCENDING}; std::vector null_precedences{cudf::null_order::AFTER, cudf::null_order::BEFORE}; diff --git a/python/cudf/cudf/_lib/cpp/concatenate.pxd b/python/cudf/cudf/_lib/cpp/concatenate.pxd index b5ec3bcb7d4..c776d23aa85 100644 --- a/python/cudf/cudf/_lib/cpp/concatenate.pxd +++ b/python/cudf/cudf/_lib/cpp/concatenate.pxd @@ -5,12 +5,22 @@ from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.table.table cimport table, table_view -from rmm._lib.device_buffer cimport device_buffer +from cudf._lib.cpp.utilities.host_span cimport host_span +from rmm._lib.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: + # The versions of concatenate taking vectors don't exist in libcudf + # C++, but passing a vector works because a host_span is implicitly + # constructable from a vector. In case they are needed in the future, + # host_span versions can be added, e.g: + # + # cdef device_buffer concatenate_masks "cudf::concatenate_masks"( + # host_span[column_view] views + # ) except + + cdef device_buffer concatenate_masks "cudf::concatenate_masks"( - const vector[column_view] columns + const vector[column_view] views ) except + cdef unique_ptr[column] concatenate_columns "cudf::concatenate"( const vector[column_view] columns diff --git a/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd b/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd new file mode 100644 index 00000000000..cbbe3710347 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.vector cimport vector + +cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil: + cdef cppclass host_span[T]: + host_span() except + + host_span(vector[T]) except + From f1f1d0fbeae24faec3a82c7a4a9dd6f3cedc9ef1 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Thu, 25 Mar 2021 01:14:10 -0400 Subject: [PATCH 14/14] Add column_device_view to orc writer (#7676) This PR adds column_device_view members to EncChunk, DictionaryChunk and StripeDictionary structures which are used in the ORC writer. The idea is to replace members in these structures which replicate the same information. Usage of nvstrdesc_s has also been eliminated in the ORC writer. 
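For illustration, the sketch below shows the access pattern this change enables. It is a minimal, hypothetical kernel: the name `count_string_bytes`, its parameters, and the byte-counting logic are invented for this example and are not code from the patch, while `cudf::column_device_view`, `element<string_view>()`, and `is_valid()` are existing libcudf device APIs. A kernel holding a `column_device_view *` reads string values and validity straight from the column, instead of being handed a base pointer to an array of `nvstrdesc_s` (ptr, count) pairs plus a separate null-mask pointer.

```cuda
// Hypothetical illustration only -- not code from this patch.
// A column_device_view* gives a kernel both the values and the null mask of a
// string column, replacing the old base-pointer + nvstrdesc_s (ptr, count) layout.
#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/types.hpp>

__global__ void count_string_bytes(cudf::column_device_view const* leaf_column,
                                   unsigned long long* total_bytes)
{
  auto const row = static_cast<cudf::size_type>(blockIdx.x * blockDim.x + threadIdx.x);
  if (row >= leaf_column->size()) return;
  if (!leaf_column->is_valid(row)) return;  // validity comes from the view's null mask
  // element<string_view>() returns a zero-copy (pointer, length) view of the row's string
  auto const str = leaf_column->element<cudf::string_view>(row);
  atomicAdd(total_bytes, static_cast<unsigned long long>(str.size_bytes()));
}
```

On the host side such a view is typically obtained via `cudf::column_device_view::create(col, stream)` or through a `table_device_view`, which is what makes separately materialized string descriptor arrays unnecessary.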
Fixes #7347, Addresses #5682, Addresses #7334 Authors: - Kumar Aatish (@kaatish) Approvers: - Vukasin Milovanovic (@vuule) - Devavret Makkar (@devavret) URL: https://github.com/rapidsai/cudf/pull/7676 --- cpp/src/io/orc/dict_enc.cu | 151 +++++++++--------- cpp/src/io/orc/orc_gpu.h | 69 ++++++--- cpp/src/io/orc/stripe_enc.cu | 82 ++++++---- cpp/src/io/orc/writer_impl.cu | 213 ++++++++++---------------- cpp/src/io/orc/writer_impl.hpp | 10 +- cpp/src/io/parquet/page_dict.cu | 8 +- cpp/src/io/parquet/page_enc.cu | 10 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/statistics/column_stats.cu | 6 - cpp/src/io/statistics/column_stats.h | 5 +- 10 files changed, 283 insertions(+), 273 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 99157a23fcb..5695e882a95 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -17,6 +17,7 @@ #include "orc_common.h" #include "orc_gpu.h" +#include #include #include @@ -46,14 +47,16 @@ struct dictinit_state_s { }; /** - * @brief Return a 12-bit hash from a byte sequence + * @brief Return a 12-bit hash from a string */ -static inline __device__ uint32_t nvstr_init_hash(char const *ptr, uint32_t len) +static inline __device__ uint32_t hash_string(const string_view val) { - if (len != 0) { - return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); - } else { + if (val.empty()) { return 0; + } else { + char const *ptr = val.data(); + uint32_t len = val.size_bytes(); + return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); } } @@ -71,7 +74,8 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, { if (t == 0) { s->nnz = 0; } for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { - const uint32_t *valid_map = s->chunk.valid_map_base; + const uint32_t *valid_map = s->chunk.leaf_column->null_mask(); + auto column_offset = s->chunk.leaf_column->offset(); uint32_t is_valid, nz_pos; if (t < block_size / 32) { if (!valid_map) { @@ -80,10 +84,10 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, uint32_t const row = s->chunk.start_row + i + t * 32; auto const chunk_end = s->chunk.start_row + s->chunk.num_rows; - auto const valid_map_idx = (row + s->chunk.column_offset) / 32; + auto const valid_map_idx = (row + column_offset) / 32; uint32_t valid = (row < chunk_end) ? 
valid_map[valid_map_idx] : 0; - auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f; + auto const rows_in_next_word = (row + column_offset) & 0x1f; if (rows_in_next_word != 0) { auto const rows_in_current_word = 32 - rows_in_next_word; // Read next word if any rows are within the chunk @@ -111,12 +115,18 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, * @brief Gather all non-NULL string rows and compute total character data size * * @param[in] chunks DictionaryChunk device array [rowgroup][column] - * @param[in] num_columns Number of columns + * @param[in] num_columns Number of string columns */ // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) - gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns) + gpuInitDictionaryIndices(DictionaryChunk *chunks, + const table_device_view view, + uint32_t *dict_data, + uint32_t *dict_index, + size_t row_index_stride, + size_type *str_col_ids, + uint32_t num_columns) { __shared__ __align__(16) dictinit_state_s state_g; @@ -131,12 +141,21 @@ __global__ void __launch_bounds__(block_size, 2) dictinit_state_s *const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t group_id = blockIdx.y; - const nvstrdesc_s *ck_data; - uint32_t *dict_data; uint32_t nnz, start_row, dict_char_count; int t = threadIdx.x; - if (t == 0) s->chunk = chunks[group_id * num_columns + col_id]; + if (t == 0) { + column_device_view *leaf_column_view = view.begin() + str_col_ids[col_id]; + s->chunk = chunks[group_id * num_columns + col_id]; + s->chunk.leaf_column = leaf_column_view; + s->chunk.dict_data = + dict_data + col_id * leaf_column_view->size() + group_id * row_index_stride; + s->chunk.dict_index = dict_index + col_id * leaf_column_view->size(); + s->chunk.start_row = group_id * row_index_stride; + s->chunk.num_rows = + min(row_index_stride, + max(static_cast(leaf_column_view->size() - s->chunk.start_row), size_t{0})); + } for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) { if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0; } @@ -152,15 +171,15 @@ __global__ void __launch_bounds__(block_size, 2) nnz = s->nnz; dict_data = s->chunk.dict_data; start_row = s->chunk.start_row; - ck_data = static_cast(s->chunk.column_data_base) + start_row; for (uint32_t i = 0; i < nnz; i += block_size) { uint32_t ck_row = 0; uint32_t hash = 0; uint32_t len = 0; if (i + t < nnz) { - ck_row = s->dict[i + t]; - len = static_cast(ck_data[ck_row].count); - hash = nvstr_init_hash(ck_data[ck_row].ptr, len); + ck_row = s->dict[i + t]; + string_view string_val = s->chunk.leaf_column->element(ck_row + start_row); + len = static_cast(string_val.size_bytes()); + hash = hash_string(string_val); } len = block_reduce(temp_storage.reduce_storage).Sum(len); if (t == 0) s->chunk.string_char_count += len; @@ -200,10 +219,11 @@ __global__ void __launch_bounds__(block_size, 2) uint32_t ck_row = 0, pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row; bool collision; if (i + t < nnz) { - ck_row = dict_data[i + t] - start_row; - hash = nvstr_init_hash(ck_data[ck_row].ptr, static_cast(ck_data[ck_row].count)); - sh = (hash & 1) ? 16 : 0; - pos_old = s->map.u16[hash]; + ck_row = dict_data[i + t] - start_row; + string_view string_val = s->chunk.leaf_column->element(ck_row + start_row); + hash = hash_string(string_val); + sh = (hash & 1) ? 
16 : 0; + pos_old = s->map.u16[hash]; } // The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic // behavior for the first row in the hash map that will be used for early duplicate detection @@ -233,18 +253,16 @@ __global__ void __launch_bounds__(block_size, 2) for (uint32_t i = 0; i < nnz; i += block_size) { uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0; if (i + t < nnz) { - const char *str1, *str2; - uint32_t len1, len2, hash; - ck_row = s->dict[i + t]; - str1 = ck_data[ck_row].ptr; - len1 = static_cast(ck_data[ck_row].count); - hash = nvstr_init_hash(str1, len1); - ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0]; + ck_row = s->dict[i + t]; + string_view string_value = s->chunk.leaf_column->element(ck_row + start_row); + auto const string_length = static_cast(string_value.size_bytes()); + auto const hash = hash_string(string_value); + ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0]; if (ck_row_ref != ck_row) { - str2 = ck_data[ck_row_ref].ptr; - len2 = static_cast(ck_data[ck_row_ref].count); - is_dupe = nvstr_is_equal(str1, len1, str2, len2); - dict_char_count += (is_dupe) ? 0 : len1; + string_view reference_string = + s->chunk.leaf_column->element(ck_row_ref + start_row); + is_dupe = (string_value == reference_string); + dict_char_count += (is_dupe) ? 0 : string_length; } } uint32_t dupes_in_block; @@ -269,6 +287,12 @@ __global__ void __launch_bounds__(block_size, 2) chunks[group_id * num_columns + col_id].string_char_count = s->chunk.string_char_count; chunks[group_id * num_columns + col_id].num_dict_strings = nnz - s->total_dupes; chunks[group_id * num_columns + col_id].dict_char_count = dict_char_count; + chunks[group_id * num_columns + col_id].leaf_column = s->chunk.leaf_column; + + chunks[group_id * num_columns + col_id].dict_data = s->chunk.dict_data; + chunks[group_id * num_columns + col_id].dict_index = s->chunk.dict_index; + chunks[group_id * num_columns + col_id].start_row = s->chunk.start_row; + chunks[group_id * num_columns + col_id].num_rows = s->chunk.num_rows; } } @@ -357,7 +381,6 @@ __global__ void __launch_bounds__(block_size) uint32_t num_strings; uint32_t *dict_data, *dict_index; uint32_t dict_char_count; - const nvstrdesc_s *str_data; int t = threadIdx.x; if (t == 0) s->stripe = stripes[stripe_id * num_columns + col_id]; @@ -366,21 +389,17 @@ __global__ void __launch_bounds__(block_size) num_strings = s->stripe.num_strings; dict_data = s->stripe.dict_data; if (!dict_data) return; - dict_index = s->stripe.dict_index; - str_data = static_cast(s->stripe.column_data_base); - dict_char_count = 0; + dict_index = s->stripe.dict_index; + string_view current_string = string_view::min(); + dict_char_count = 0; for (uint32_t i = 0; i < num_strings; i += block_size) { uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0; uint32_t cur_len = 0; - const char *cur_ptr; - bool is_dupe = false; - if (i + t < num_strings) { - cur_ptr = str_data[cur].ptr; - cur_len = str_data[cur].count; - } + bool is_dupe = false; + if (i + t < num_strings) { current_string = s->stripe.leaf_column->element(cur); } if (i + t != 0 && i + t < num_strings) { uint32_t prev = dict_data[i + t - 1]; - is_dupe = nvstr_is_equal(cur_ptr, cur_len, str_data[prev].ptr, str_data[prev].count); + is_dupe = (current_string == (s->stripe.leaf_column->element(prev))); } dict_char_count += (is_dupe) ? 
0 : cur_len; uint32_t dupes_in_block; @@ -403,14 +422,14 @@ __global__ void __launch_bounds__(block_size) } /** - * @brief Launches kernel for initializing dictionary chunks - * - * @param[in] chunks DictionaryChunk device array [rowgroup][column] - * @param[in] num_columns Number of columns - * @param[in] num_rowgroups Number of row groups - * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` + * @copydoc cudf::io::orc::gpu::InitDictionaryIndices */ -void InitDictionaryIndices(DictionaryChunk *chunks, +void InitDictionaryIndices(const table_device_view &view, + DictionaryChunk *chunks, + uint32_t *dict_data, + uint32_t *dict_index, + size_t row_index_stride, + size_type *str_col_ids, uint32_t num_columns, uint32_t num_rowgroups, rmm::cuda_stream_view stream) @@ -418,20 +437,12 @@ void InitDictionaryIndices(DictionaryChunk *chunks, static constexpr int block_size = 512; dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices - <<>>(chunks, num_columns); + gpuInitDictionaryIndices<<>>( + chunks, view, dict_data, dict_index, row_index_stride, str_col_ids, num_columns); } /** - * @brief Launches kernel for building stripe dictionaries - * - * @param[in] stripes StripeDictionary device array [stripe][column] - * @param[in] stripes_host StripeDictionary host array [stripe][column] - * @param[in] chunks DictionaryChunk device array [rowgroup][column] - * @param[in] num_stripes Number of stripes - * @param[in] num_rowgroups Number of row groups - * @param[in] num_columns Number of columns - * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` + * @copydoc cudf::io::orc::gpu::BuildStripeDictionaries */ void BuildStripeDictionaries(StripeDictionary *stripes, StripeDictionary *stripes_host, @@ -447,18 +458,16 @@ void BuildStripeDictionaries(StripeDictionary *stripes, stripes, chunks, num_columns); for (uint32_t i = 0; i < num_stripes * num_columns; i++) { if (stripes_host[i].dict_data != nullptr) { - thrust::device_ptr p = thrust::device_pointer_cast(stripes_host[i].dict_data); - const nvstrdesc_s *str_data = - static_cast(stripes_host[i].column_data_base); + thrust::device_ptr dict_data_ptr = + thrust::device_pointer_cast(stripes_host[i].dict_data); + column_device_view *string_column = stripes_host[i].leaf_column; // NOTE: Requires the --expt-extended-lambda nvcc flag thrust::sort(rmm::exec_policy(stream), - p, - p + stripes_host[i].num_strings, - [str_data] __device__(const uint32_t &lhs, const uint32_t &rhs) { - return nvstr_is_lesser(str_data[lhs].ptr, - (uint32_t)str_data[lhs].count, - str_data[rhs].ptr, - (uint32_t)str_data[rhs].count); + dict_data_ptr, + dict_data_ptr + stripes_host[i].num_strings, + [string_column] __device__(const uint32_t &lhs, const uint32_t &rhs) { + return string_column->element(lhs) < + string_column->element(rhs); }); } } diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 7ad92e40cb4..55df0adf95b 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -124,16 +125,15 @@ struct RowGroup { * @brief Struct to describe an encoder data chunk */ struct EncChunk { - const uint32_t *valid_map_base; // base ptr of input valid bit map - size_type column_offset; // index of the first element relative to the base memory - const void *column_data_base; // base ptr of input column data - uint32_t start_row; // start row of this chunk - uint32_t num_rows; // number of rows in this chunk 
- uint32_t valid_rows; // max number of valid rows - uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind) - uint8_t type_kind; // column data type (orc::TypeKind) - uint8_t dtype_len; // data type length - uint8_t scale; // scale for decimals or timestamps + uint32_t start_row; // start row of this chunk + uint32_t num_rows; // number of rows in this chunk + uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind) + uint8_t type_kind; // column data type (orc::TypeKind) + uint8_t dtype_len; // data type length + uint8_t scale; // scale for decimals or timestamps + + uint32_t *dict_index; // dictionary index from row index + column_device_view *leaf_column; }; /** @@ -163,10 +163,7 @@ struct StripeStream { * @brief Struct to describe a dictionary chunk */ struct DictionaryChunk { - const uint32_t *valid_map_base; // base ptr of input valid bit map - size_type column_offset; // index of the first element relative to the base memory - const void *column_data_base; // base ptr of column data (ptr,len pair) - uint32_t *dict_data; // dictionary data (index of non-null rows) + uint32_t *dict_data; // dictionary data (index of non-null rows) uint32_t *dict_index; // row indices of corresponding string (row from dictionary index) uint32_t start_row; // start row of this chunk uint32_t num_rows; // num rows in this chunk @@ -175,20 +172,23 @@ struct DictionaryChunk { string_char_count; // total size of string data (NOTE: assumes less than 4G bytes per chunk) uint32_t num_dict_strings; // number of strings in dictionary uint32_t dict_char_count; // size of dictionary string data for this chunk + + column_device_view *leaf_column; //!< Pointer to string column }; /** * @brief Struct to describe a dictionary */ struct StripeDictionary { - const void *column_data_base; // base ptr of column data (ptr,len pair) - uint32_t *dict_data; // row indices of corresponding string (row from dictionary index) - uint32_t *dict_index; // dictionary index from row index - uint32_t column_id; // real column id - uint32_t start_chunk; // first chunk in stripe - uint32_t num_chunks; // number of chunks in the stripe - uint32_t num_strings; // number of unique strings in the dictionary - uint32_t dict_char_count; // total size of dictionary string data + uint32_t *dict_data; // row indices of corresponding string (row from dictionary index) + uint32_t *dict_index; // dictionary index from row index + uint32_t column_id; // real column id + uint32_t start_chunk; // first chunk in stripe + uint32_t num_chunks; // number of chunks in the stripe + uint32_t num_strings; // number of unique strings in the dictionary + uint32_t dict_char_count; // total size of dictionary string data + + column_device_view *leaf_column; //!< Pointer to string column }; /** @@ -313,6 +313,17 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, detail::device_2dspan enc_streams, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @brief Set leaf column element of EncChunk + * + * @param[in] view table device view representing input table + * @param[in,out] chunks encoder chunk device array [column][rowgroup] + * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` + */ +void set_chunk_columns(const table_device_view &view, + detail::device_2dspan chunks, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for compacting chunked column data prior to compression * @@ -350,15 +361,25 @@ void CompressOrcDataStreams(uint8_t *compressed_data, /** * @brief Launches 
kernel for initializing dictionary chunks * + * @param[in] view table device view representing input table * @param[in,out] chunks DictionaryChunk device array [rowgroup][column] + * @param[in] dict_data dictionary data (index of non-null rows) + * @param[in] dict_index row indices of corresponding string (row from dictionary index) + * @param[in] row_index_stride Rowgroup size in rows + * @param[in] str_col_ids List of columns that are strings type * @param[in] num_columns Number of columns * @param[in] num_rowgroups Number of row groups * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void InitDictionaryIndices(DictionaryChunk *chunks, +void InitDictionaryIndices(const table_device_view &view, + DictionaryChunk *chunks, + uint32_t *dict_data, + uint32_t *dict_index, + size_t row_index_stride, + size_type *str_col_ids, uint32_t num_columns, uint32_t num_rowgroups, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building stripe dictionaries diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index aef32efaf6e..10932d36309 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -669,19 +669,20 @@ __global__ void __launch_bounds__(block_size) if (t * 8 < nrows) { uint32_t row = s->chunk.start_row + present_rows + t * 8; uint8_t valid = 0; - if (row < s->chunk.valid_rows) { - if (s->chunk.valid_map_base) { - size_type current_valid_offset = row + s->chunk.column_offset; - size_type next_valid_offset = current_valid_offset + min(32, s->chunk.valid_rows); + if (row < s->chunk.leaf_column->size()) { + if (s->chunk.leaf_column->nullable()) { + size_type current_valid_offset = row + s->chunk.leaf_column->offset(); + size_type next_valid_offset = + current_valid_offset + min(32, s->chunk.leaf_column->size()); bitmask_type mask = cudf::detail::get_mask_offset_word( - s->chunk.valid_map_base, 0, current_valid_offset, next_valid_offset); + s->chunk.leaf_column->null_mask(), 0, current_valid_offset, next_valid_offset); valid = 0xff & mask; } else { valid = 0xff; } - if (row + 7 > s->chunk.valid_rows) { - valid = valid & ((1 << (s->chunk.valid_rows & 7)) - 1); + if (row + 7 > s->chunk.leaf_column->size()) { + valid = valid & ((1 << (s->chunk.leaf_column->size() & 7)) - 1); } } s->valid_buf[(row >> 3) & 0x1ff] = valid; @@ -729,19 +730,18 @@ __global__ void __launch_bounds__(block_size) lengths_to_positions(s->buf.u32, 512, t); __syncthreads(); if (valid) { - int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); - void const *base = s->chunk.column_data_base; + int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); switch (s->chunk.type_kind) { case INT: case DATE: - case FLOAT: s->vals.u32[nz_idx] = static_cast(base)[row]; break; + case FLOAT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; case DOUBLE: - case LONG: s->vals.u64[nz_idx] = static_cast(base)[row]; break; - case SHORT: s->vals.u32[nz_idx] = static_cast(base)[row]; break; + case LONG: s->vals.u64[nz_idx] = s->chunk.leaf_column->element(row); break; + case SHORT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; case BOOLEAN: - case BYTE: s->vals.u8[nz_idx] = static_cast(base)[row]; break; + case BYTE: s->vals.u8[nz_idx] = s->chunk.leaf_column->element(row); break; case TIMESTAMP: { - int64_t ts = static_cast(base)[row]; + int64_t ts = s->chunk.leaf_column->element(row); int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / 
ts_scale; int64_t nanos = (ts - seconds * ts_scale); @@ -772,16 +772,13 @@ __global__ void __launch_bounds__(block_size) } case STRING: if (s->chunk.encoding_kind == DICTIONARY_V2) { - uint32_t dict_idx = static_cast(base)[row]; - if (dict_idx > 0x7fffffffu) - dict_idx = static_cast(base)[dict_idx & 0x7fffffffu]; + uint32_t dict_idx = s->chunk.dict_index[row]; + if (dict_idx > 0x7fffffffu) dict_idx = s->chunk.dict_index[dict_idx & 0x7fffffffu]; s->vals.u32[nz_idx] = dict_idx; } else { - const nvstrdesc_s *str_desc = static_cast(base) + row; - const char *ptr = str_desc->ptr; - uint32_t count = static_cast(str_desc->count); - s->u.strenc.str_data[s->buf.u32[t] - 1] = ptr; - s->lengths.u32[nz_idx] = count; + string_view value = s->chunk.leaf_column->element(row); + s->u.strenc.str_data[s->buf.u32[t] - 1] = value.data(); + s->lengths.u32[nz_idx] = value.size_bytes(); } break; default: break; @@ -899,8 +896,8 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.column_data_base)) + - s->chunk.start_row * s->chunk.dtype_len; + static_cast(const_cast(s->chunk.leaf_column->head())) + + (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; } } } @@ -939,8 +936,8 @@ __global__ void __launch_bounds__(block_size) s->nrows = s->u.dict_stripe.num_strings; s->cur_row = 0; } - auto const str_desc = static_cast(s->u.dict_stripe.column_data_base); - auto const dict_data = s->u.dict_stripe.dict_data; + column_device_view *string_column = s->u.dict_stripe.leaf_column; + auto const dict_data = s->u.dict_stripe.dict_data; __syncthreads(); if (s->chunk.encoding_kind != DICTIONARY_V2) { return; // This column isn't using dictionary encoding -> bail out @@ -951,8 +948,13 @@ __global__ void __launch_bounds__(block_size) uint32_t string_idx = (t < numvals) ? dict_data[s->cur_row + t] : 0; if (cid == CI_DICTIONARY) { // Encoding string contents - const char *ptr = (t < numvals) ? str_desc[string_idx].ptr : 0; - uint32_t count = (t < numvals) ? static_cast(str_desc[string_idx].count) : 0; + const char *ptr = 0; + uint32_t count = 0; + if (t < numvals) { + auto string_val = string_column->element(string_idx); + ptr = string_val.data(); + count = string_val.size_bytes(); + } s->u.strenc.str_data[t] = ptr; StoreStringData(s->stream.data_ptrs[CI_DICTIONARY] + s->strm_pos[CI_DICTIONARY], &s->u.strenc, @@ -961,7 +963,10 @@ __global__ void __launch_bounds__(block_size) if (!t) { s->strm_pos[CI_DICTIONARY] += s->u.strenc.char_count; } } else { // Encoding string lengths - uint32_t count = (t < numvals) ? static_cast(str_desc[string_idx].count) : 0; + uint32_t count = + (t < numvals) + ? 
static_cast(string_column->element(string_idx).size_bytes()) + : 0; uint32_t nz_idx = (s->cur_row + t) & 0x3ff; if (t < numvals) s->lengths.u32[nz_idx] = count; __syncthreads(); @@ -982,6 +987,15 @@ __global__ void __launch_bounds__(block_size) if (t == 0) { strm_ptr->lengths[cid] = s->strm_pos[cid]; } } +__global__ void __launch_bounds__(512) + gpu_set_chunk_columns(const table_device_view view, device_2dspan chunks) +{ + // Set leaf_column member of EncChunk + for (size_type i = threadIdx.x; i < chunks.size().second; i += blockDim.x) { + chunks[blockIdx.x][i].leaf_column = view.begin() + blockIdx.x; + } +} + /** * @brief Merge chunked column data into a single contiguous stream * @@ -1189,6 +1203,16 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, <<>>(stripes, chunks, enc_streams); } +void set_chunk_columns(const table_device_view &view, + device_2dspan chunks, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(512, 1); + dim3 dim_grid(chunks.size().first, 1); + + gpu_set_chunk_columns<<>>(view, chunks); +} + void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index eb5e90bbeec..cb75698fd8d 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -42,7 +42,6 @@ namespace detail { namespace orc { using namespace cudf::io::orc; using namespace cudf::io; -using cudf::io::orc::gpu::nvstrdesc_s; struct row_group_index_info { int32_t pos = -1; // Position @@ -111,39 +110,6 @@ constexpr T to_clockscale(cudf::type_id timestamp_id) } // namespace -/** - * @brief Helper kernel for converting string data/offsets into nvstrdesc - * REMOVEME: Once we eliminate the legacy readers/writers, the kernels could be - * made to use the native offset+data layout. - */ -__global__ void stringdata_to_nvstrdesc(gpu::nvstrdesc_s *dst, - const size_type *offsets, - const char *strdata, - const uint32_t *nulls, - const size_type column_offset, - size_type column_size) -{ - size_type row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < column_size) { - uint32_t is_valid = (nulls != nullptr) - ? (nulls[(row + column_offset) / 32] >> ((row + column_offset) % 32)) & 1 - : 1; - size_t count; - const char *ptr; - if (is_valid) { - size_type cur = offsets[row]; - size_type next = offsets[row + 1]; - ptr = strdata + cur; - count = (next > cur) ? next - cur : 0; - } else { - ptr = nullptr; - count = 0; - } - dst[row].ptr = ptr; - dst[row].count = count; - } -} - /** * @brief Helper class that adds ORC-specific column info */ @@ -160,31 +126,14 @@ class orc_column_view { rmm::cuda_stream_view stream) : _id(id), _str_id(str_id), - _string_type(col.type().id() == type_id::STRING), - _type_width(_string_type ? 0 : cudf::size_of(col.type())), + _is_string_type(col.type().id() == type_id::STRING), + _type_width(_is_string_type ? 
0 : cudf::size_of(col.type())), _data_count(col.size()), _null_count(col.null_count()), - _data(col.head() + col.offset() * _type_width), _nulls(col.null_mask()), - _column_offset(col.offset()), _clockscale(to_clockscale(col.type().id())), _type_kind(to_orc_type(col.type().id())) { - if (_string_type && _data_count > 0) { - strings_column_view view{col}; - _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( - static_cast(_indexes.data()), - view.offsets().data() + view.offset(), - view.chars().data(), - _nulls, - _column_offset, - _data_count); - _data = _indexes.data(); - - stream.synchronize(); - } // Generating default name if name isn't present in metadata if (metadata && _id < metadata->column_names.size()) { _name = metadata->column_names[_id]; @@ -193,7 +142,7 @@ class orc_column_view { } } - auto is_string() const noexcept { return _string_type; } + auto is_string() const noexcept { return _is_string_type; } void set_dict_stride(size_t stride) noexcept { dict_stride = stride; } auto get_dict_stride() const noexcept { return dict_stride; } @@ -207,7 +156,7 @@ class orc_column_view { } auto host_dict_chunk(size_t rowgroup) const { - assert(_string_type); + assert(_is_string_type); return &dict[rowgroup * dict_stride + _str_id]; } auto device_dict_chunk() const { return d_dict; } @@ -223,7 +172,7 @@ class orc_column_view { } auto host_stripe_dict(size_t stripe) const { - assert(_string_type); + assert(_is_string_type); return &stripe_dict[stripe * dict_stride + _str_id]; } auto device_stripe_dict() const { return d_stripe_dict; } @@ -233,9 +182,7 @@ class orc_column_view { size_t data_count() const noexcept { return _data_count; } size_t null_count() const noexcept { return _null_count; } bool nullable() const noexcept { return (_nulls != nullptr); } - void const *data() const noexcept { return _data; } uint32_t const *nulls() const noexcept { return _nulls; } - size_type column_offset() const noexcept { return _column_offset; } uint8_t clockscale() const noexcept { return _clockscale; } void set_orc_encoding(ColumnEncodingKind e) { _encoding_kind = e; } @@ -245,17 +192,15 @@ class orc_column_view { private: // Identifier within set of columns and string columns, respectively - size_t _id = 0; - size_t _str_id = 0; - bool _string_type = false; - - size_t _type_width = 0; - size_t _data_count = 0; - size_t _null_count = 0; - void const *_data = nullptr; - uint32_t const *_nulls = nullptr; - size_type _column_offset = 0; - uint8_t _clockscale = 0; + size_t _id = 0; + size_t _str_id = 0; + bool _is_string_type = false; + + size_t _type_width = 0; + size_t _data_count = 0; + size_t _null_count = 0; + uint32_t const *_nulls = nullptr; + uint8_t _clockscale = 0; // ORC-related members std::string _name{}; @@ -263,7 +208,6 @@ class orc_column_view { ColumnEncodingKind _encoding_kind; // String dictionary-related members - rmm::device_buffer _indexes; size_t dict_stride = 0; gpu::DictionaryChunk const *dict = nullptr; gpu::StripeDictionary const *stripe_dict = nullptr; @@ -308,8 +252,10 @@ std::vector writer::impl::gather_stripe_info( return infos; } -void writer::impl::init_dictionaries(orc_column_view *columns, +void writer::impl::init_dictionaries(const table_device_view &view, + orc_column_view *columns, std::vector const &str_col_ids, + device_span d_str_col_ids, uint32_t *dict_data, uint32_t *dict_index, hostdevice_vector *dict) @@ -321,26 +267,17 @@ void 
writer::impl::init_dictionaries(orc_column_view *columns, auto &str_column = columns[str_col_ids[i]]; str_column.set_dict_stride(str_col_ids.size()); str_column.attach_dict_chunk(dict->host_ptr(), dict->device_ptr()); - - for (size_t g = 0; g < num_rowgroups; g++) { - auto *ck = &(*dict)[g * str_col_ids.size() + i]; - ck->valid_map_base = str_column.nulls(); - ck->column_offset = str_column.column_offset(); - ck->column_data_base = str_column.data(); - ck->dict_data = dict_data + i * str_column.data_count() + g * row_index_stride_; - ck->dict_index = dict_index + i * str_column.data_count(); // Indexed by abs row - ck->start_row = g * row_index_stride_; - ck->num_rows = std::min(row_index_stride_, - std::max(str_column.data_count() - ck->start_row, 0)); - ck->num_strings = 0; - ck->string_char_count = 0; - ck->num_dict_strings = 0; - ck->dict_char_count = 0; - } } - dict->host_to_device(stream); - gpu::InitDictionaryIndices(dict->device_ptr(), str_col_ids.size(), num_rowgroups, stream); + gpu::InitDictionaryIndices(view, + dict->device_ptr(), + dict_data, + dict_index, + row_index_stride_, + d_str_col_ids.data(), + d_str_col_ids.size(), + num_rowgroups, + stream); dict->device_to_host(stream, true); } @@ -358,19 +295,19 @@ void writer::impl::build_dictionaries(orc_column_view *columns, str_column.attach_stripe_dict(stripe_dict.host_ptr(), stripe_dict.device_ptr()); for (auto const &stripe : stripe_bounds) { - auto &sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; - sd.column_data_base = str_column.host_dict_chunk(0)->column_data_base; - sd.dict_data = str_column.host_dict_chunk(stripe.first)->dict_data; - sd.dict_index = dict_index + col_idx * str_column.data_count(); // Indexed by abs row - sd.column_id = str_col_ids[col_idx]; - sd.start_chunk = stripe.first; - sd.num_chunks = stripe.size; - sd.dict_char_count = 0; + auto &sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; + sd.dict_data = str_column.host_dict_chunk(stripe.first)->dict_data; + sd.dict_index = dict_index + col_idx * str_column.data_count(); // Indexed by abs row + sd.column_id = str_col_ids[col_idx]; + sd.start_chunk = stripe.first; + sd.num_chunks = stripe.size; + sd.dict_char_count = 0; sd.num_strings = std::accumulate(stripe.cbegin(), stripe.cend(), 0, [&](auto dt_str_cnt, auto rg_idx) { const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; return dt_str_cnt + dt.num_dict_strings; }); + sd.leaf_column = dict[col_idx].leaf_column; } if (enable_dictionary_) { @@ -593,15 +530,16 @@ struct segmented_valid_cnt_input { std::vector indices; }; -encoded_data writer::impl::encode_columns(host_span columns, +encoded_data writer::impl::encode_columns(const table_device_view &view, + host_span columns, std::vector const &str_col_ids, host_span stripe_bounds, orc_streams const &streams) { auto const num_columns = columns.size(); auto const num_rowgroups = stripes_size(stripe_bounds); - hostdevice_2dvector chunks(num_columns, num_rowgroups); - hostdevice_2dvector chunk_streams(num_columns, num_rowgroups); + hostdevice_2dvector chunks(num_columns, num_rowgroups, stream); + hostdevice_2dvector chunk_streams(num_columns, num_rowgroups, stream); auto const stream_offsets = streams.compute_offsets(columns, num_rowgroups); rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); @@ -614,23 +552,17 @@ encoded_data writer::impl::encode_columns(host_span colum auto const rg_idx = *rg_idx_it; auto &ck = chunks[column.id()][rg_idx]; - ck.start_row = (rg_idx * row_index_stride_); - ck.num_rows = 
std::min(row_index_stride_, column.data_count() - ck.start_row); - ck.valid_rows = column.data_count(); + ck.start_row = (rg_idx * row_index_stride_); + ck.num_rows = std::min(row_index_stride_, column.data_count() - ck.start_row); ck.encoding_kind = column.orc_encoding(); ck.type_kind = column.orc_kind(); if (ck.type_kind == TypeKind::STRING) { - ck.valid_map_base = column.nulls(); - ck.column_offset = column.column_offset(); - ck.column_data_base = (ck.encoding_kind == DICTIONARY_V2) - ? column.host_stripe_dict(stripe.id)->dict_index - : column.data(); + ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) + ? column.host_stripe_dict(stripe.id)->dict_index + : nullptr; ck.dtype_len = 1; } else { - ck.valid_map_base = column.nulls(); - ck.column_offset = column.column_offset(); - ck.column_data_base = column.data(); - ck.dtype_len = column.type_width(); + ck.dtype_len = column.type_width(); } ck.scale = column.clockscale(); // Only need to check row groups that end within the stripe @@ -730,6 +662,8 @@ encoded_data writer::impl::encode_columns(host_span colum chunks.host_to_device(stream); chunk_streams.host_to_device(stream); + gpu::set_chunk_columns(view, chunks, stream); + if (!str_col_ids.empty()) { auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict(); gpu::EncodeStripeDictionaries( @@ -791,8 +725,8 @@ std::vector> writer::impl::gather_statistic_blobs( size_t num_chunks = num_rowgroups * columns.size(); std::vector> stat_blobs(num_stat_blobs); - hostdevice_vector stat_desc(columns.size()); - hostdevice_vector stat_merge(num_stat_blobs); + hostdevice_vector stat_desc(columns.size(), stream); + hostdevice_vector stat_merge(num_stat_blobs, stream); rmm::device_uvector stat_chunks(num_chunks + num_stat_blobs, stream); rmm::device_uvector stat_groups(num_chunks, stream); @@ -811,11 +745,8 @@ std::vector> writer::impl::gather_statistic_blobs( case TypeKind::STRING: desc->stats_dtype = dtype_string; break; default: desc->stats_dtype = dtype_none; break; } - desc->num_rows = column.data_count(); - desc->num_values = column.data_count(); - desc->valid_map_base = column.nulls(); - desc->column_offset = column.column_offset(); - desc->column_data_base = column.data(); + desc->num_rows = column.data_count(); + desc->num_values = column.data_count(); if (desc->stats_dtype == dtype_timestamp64) { // Timestamp statistics are in milliseconds switch (column.clockscale()) { @@ -869,8 +800,8 @@ std::vector> writer::impl::gather_statistic_blobs( stat_merge.device_ptr(), stat_chunks.data() + num_chunks, num_stat_blobs, stream); stat_merge.device_to_host(stream, true); - hostdevice_vector blobs(stat_merge[num_stat_blobs - 1].start_chunk + - stat_merge[num_stat_blobs - 1].num_chunks); + hostdevice_vector blobs( + stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream); gpu::orc_encode_statistics(blobs.device_ptr(), stat_merge.device_ptr(), stat_chunks.data() + num_chunks, @@ -1061,6 +992,22 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } +rmm::device_uvector get_string_column_ids(const table_device_view &view, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector string_column_ids(view.num_columns(), stream); + auto iter = thrust::make_counting_iterator(0); + auto end_iter = thrust::copy_if(rmm::exec_policy(stream), + iter, + iter + view.num_columns(), + string_column_ids.begin(), + [view] __device__(size_type index) { + return (view.column(index).type().id() == type_id::STRING); + }); + 
string_column_ids.resize(end_iter - string_column_ids.begin(), stream); + return string_column_ids; +} + void writer::impl::write(table_view const &table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); @@ -1074,6 +1021,9 @@ void writer::impl::write(table_view const &table) "be specified"); } + auto device_columns = table_device_view::create(table, stream); + auto string_column_ids = get_string_column_ids(*device_columns, stream); + // Wrapper around cudf columns to attach ORC-specific type info std::vector orc_columns; orc_columns.reserve(num_columns); @@ -1093,9 +1043,15 @@ void writer::impl::write(table_view const &table) // Build per-column dictionary indices const auto num_rowgroups = div_by_rowgroups(num_rows); const auto num_dict_chunks = num_rowgroups * str_col_ids.size(); - hostdevice_vector dict(num_dict_chunks); + hostdevice_vector dict(num_dict_chunks, stream); if (!str_col_ids.empty()) { - init_dictionaries(orc_columns.data(), str_col_ids, dict_data.data(), dict_index.data(), &dict); + init_dictionaries(*device_columns, + orc_columns.data(), + str_col_ids, + string_column_ids, + dict_data.data(), + dict_index.data(), + &dict); } // Decide stripe boundaries early on, based on uncompressed size @@ -1103,23 +1059,22 @@ void writer::impl::write(table_view const &table) // Build stripe-level dictionaries const auto num_stripe_dict = stripe_bounds.size() * str_col_ids.size(); - hostdevice_vector stripe_dict(num_stripe_dict); + hostdevice_vector stripe_dict(num_stripe_dict, stream); if (!str_col_ids.empty()) { build_dictionaries( orc_columns.data(), str_col_ids, stripe_bounds, dict, dict_index.data(), stripe_dict); } auto streams = create_streams(orc_columns, stripe_bounds); - auto enc_data = encode_columns(orc_columns, str_col_ids, stripe_bounds, streams); + auto enc_data = encode_columns(*device_columns, orc_columns, str_col_ids, stripe_bounds, streams); // Assemble individual disparate column chunks into contiguous data streams const auto num_index_streams = (num_columns + 1); const auto num_data_streams = streams.size() - num_index_streams; - hostdevice_2dvector strm_descs(stripe_bounds.size(), num_data_streams); + hostdevice_2dvector strm_descs(stripe_bounds.size(), num_data_streams, stream); auto stripes = gather_stripes(num_rows, num_index_streams, stripe_bounds, &enc_data.streams, &strm_descs); - auto device_columns = table_device_view::create(table); // Gather column statistics std::vector> column_stats; if (enable_statistics_ && num_columns > 0 && num_rows > 0) { @@ -1160,8 +1115,8 @@ void writer::impl::write(table_view const &table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks); - hostdevice_vector comp_in(num_compressed_blocks); + hostdevice_vector comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_in(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index f0ec3a70cec..352cb11440f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -186,14 +186,18 @@ class writer::impl { /** * @brief Builds up column dictionaries indices * + * @param view Table device view representing input table * @param columns List of columns * @param str_col_ids List of columns that are strings type + * @param d_str_col_ids List of 
columns that are strings type in device memory * @param dict_data Dictionary data memory * @param dict_index Dictionary index memory * @param dict List of dictionary chunks */ - void init_dictionaries(orc_column_view* columns, + void init_dictionaries(const table_device_view& view, + orc_column_view* columns, std::vector const& str_col_ids, + device_span d_str_col_ids, uint32_t* dict_data, uint32_t* dict_index, hostdevice_vector* dict); @@ -238,13 +242,15 @@ class writer::impl { /** * @brief Encodes the input columns into streams. * + * @param view Table device view representing input table * @param columns List of columns * @param str_col_ids List of columns that are strings type * @param stripe_bounds List of stripe boundaries * @param stream CUDA stream used for device memory operations and kernel launches * @return Encoded data and per-chunk stream descriptors */ - encoded_data encode_columns(host_span columns, + encoded_data encode_columns(const table_device_view& view, + host_span columns, std::vector const& str_col_ids, host_span stripe_bounds, orc_streams const& streams); diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index 46d471d5cf7..2676f30474d 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -52,8 +52,10 @@ inline __device__ uint32_t uint64_hash16(uint64_t v) return uint32_hash16((uint32_t)(v + (v >> 32))); } -inline __device__ uint32_t nvstr_hash16(const uint8_t *p, uint32_t len) +inline __device__ uint32_t hash_string(const string_view &val) { + const char *p = val.data(); + uint32_t len = val.size_bytes(); uint32_t hash = len; if (len > 0) { uint32_t align_p = 3 & reinterpret_cast(p); @@ -181,7 +183,7 @@ __global__ void __launch_bounds__(block_size, 1) } else if (dtype == INT96) { dtype_len_in = 8; } else { - dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len; + dtype_len_in = dtype_len; } __syncthreads(); while (s->row_cnt < s->ck.num_rows) { @@ -206,7 +208,7 @@ __global__ void __launch_bounds__(block_size, 1) if (dtype == BYTE_ARRAY) { auto str1 = s->col.leaf_column->element(row); len += str1.size_bytes(); - hash = nvstr_hash16(reinterpret_cast(str1.data()), str1.size_bytes()); + hash = hash_string(str1); // Walk the list of rows with the same hash next_addr = &s->hashmap[hash]; while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3b29394686f..51ec0013f1a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -79,8 +79,10 @@ struct page_enc_state_s { /** * @brief Return a 12-bit hash from a byte sequence */ -inline __device__ uint32_t nvstr_init_hash(const uint8_t *ptr, uint32_t len) +inline __device__ uint32_t hash_string(const string_view &val) { + char const *ptr = val.data(); + uint32_t len = val.size_bytes(); if (len != 0) { return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); } else { @@ -199,7 +201,7 @@ __global__ void __launch_bounds__(block_size) // dtype_len, which determines how much memory we need to allocate for the fragment. dtype_len_in = 8; } else { - dtype_len_in = (dtype == BYTE_ARRAY) ? 
sizeof(nvstrdesc_s) : dtype_len; + dtype_len_in = dtype_len; } __syncthreads(); @@ -218,7 +220,7 @@ __global__ void __launch_bounds__(block_size) if (dtype == BYTE_ARRAY) { auto str = s->col.leaf_column->element(val_idx); len += str.size_bytes(); - hash = nvstr_init_hash(reinterpret_cast(str.data()), str.size_bytes()); + hash = hash_string(str); } else if (dtype_len_in == 8) { hash = uint64_init_hash(s->col.leaf_column->element(val_idx)); } else { @@ -1059,7 +1061,7 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, } else if (dtype == INT96) { dtype_len_in = 8; } else { - dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len_out; + dtype_len_in = dtype_len_out; } dict_bits = (dtype == BOOLEAN) ? 1 : (s->page.dict_bits_plus1 - 1); if (t == 0) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 31baf419f45..1e8a6920ea4 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -969,7 +969,7 @@ void writer::impl::write(table_view const &table) } // Create table_device_view so that corresponding column_device_view data // can be written into col_desc members - auto parent_column_table_device_view = table_device_view::create(single_streams_table); + auto parent_column_table_device_view = table_device_view::create(single_streams_table, stream); rmm::device_uvector leaf_column_views(0, stream); // Initialize column description diff --git a/cpp/src/io/statistics/column_stats.cu b/cpp/src/io/statistics/column_stats.cu index 128bd905259..52f21f0a9ad 100644 --- a/cpp/src/io/statistics/column_stats.cu +++ b/cpp/src/io/statistics/column_stats.cu @@ -187,12 +187,6 @@ gatherFloatColumnStats(stats_state_s *s, statistics_dtype dtype, uint32_t t, Sto } } -// FIXME: Use native libcudf string type -struct nvstrdesc_s { - const char *ptr; - size_t count; -}; - /** * @brief Gather statistics for string columns * diff --git a/cpp/src/io/statistics/column_stats.h b/cpp/src/io/statistics/column_stats.h index d1d414aa7b4..d7895de50ce 100644 --- a/cpp/src/io/statistics/column_stats.h +++ b/cpp/src/io/statistics/column_stats.h @@ -45,10 +45,7 @@ struct stats_column_desc { uint32_t num_rows; //!< number of rows in column uint32_t num_values; //!< Number of data values in column. Different from num_rows in case of //!< nested columns - const uint32_t *valid_map_base; //!< base of valid bit map for this column (null if not present) - size_type column_offset; //! < index of the first element relative to the base memory - const void *column_data_base; //!< base ptr to column data - int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) + int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) column_device_view *leaf_column; //!< Pointer to leaf column column_device_view *parent_column; //!< Pointer to parent column. Is nullptr if not list type.
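
For reference, below is a minimal standalone sketch (not part of the patch) of the string-access pattern these hunks migrate to: ordering row indices by comparing `string_view` elements obtained from a `column_device_view`, as the stripe-dictionary sort does, instead of reading the removed `nvstrdesc_s` (ptr, count) descriptors. It assumes the cudf and rmm development headers, `cudf_test` column wrappers, and the `--expt-extended-lambda` nvcc flag; the column contents and `main` harness are illustrative only.

```
// Illustrative sketch only: sorts row indices of a strings column by value,
// reading each string as a string_view from a column_device_view rather than
// from per-row nvstrdesc_s descriptors. Requires --expt-extended-lambda.
#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sequence.h>
#include <thrust/sort.h>

int main()
{
  auto stream = rmm::cuda_stream_default;

  // Small strings column standing in for a stripe's dictionary data.
  cudf::test::strings_column_wrapper input({"orc", "apple", "cudf", "banana"});
  auto d_column = cudf::column_device_view::create(input, stream);
  cudf::column_device_view const *leaf_column = d_column.get();

  // Row indices to be ordered by string value, analogous to dict_data.
  rmm::device_uvector<uint32_t> indices(4, stream);
  thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end());

  thrust::sort(rmm::exec_policy(stream),
               indices.begin(),
               indices.end(),
               [leaf_column] __device__(uint32_t lhs, uint32_t rhs) {
                 // string_view provides device-side relational operators,
                 // replacing the manual nvstr_is_lesser(ptr, count, ...) call.
                 return leaf_column->element<cudf::string_view>(lhs) <
                        leaf_column->element<cudf::string_view>(rhs);
               });

  stream.synchronize();
  return 0;
}
```

Comparing `string_view` elements directly means the writer no longer has to materialize per-row (pointer, length) descriptors up front; the device view resolves offsets and chars lazily, which is what allows the `stringdata_to_nvstrdesc` helper kernel and the `_indexes` buffer to be deleted above.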