From 2a1075e462be8df207180c872e60cd4fbeef88d9 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 20 May 2021 15:18:08 -0700 Subject: [PATCH 01/24] use address and length for GDS reads/writes (#8301) Since we want GDS reads/writes to be 4 KiB aligned, sometimes we can't use the `DeviceMemoryBuffer` as is and need to adjust the size written. This change makes the JNI APIs more flexible to accommodate those. Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8301 --- java/src/main/java/ai/rapids/cudf/CuFile.java | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CuFile.java b/java/src/main/java/ai/rapids/cudf/CuFile.java index 00c9cdb9fd5..4baad834570 100644 --- a/java/src/main/java/ai/rapids/cudf/CuFile.java +++ b/java/src/main/java/ai/rapids/cudf/CuFile.java @@ -78,11 +78,25 @@ public static boolean libraryLoaded() { * @param path The file path to copy to. * @param file_offset The file offset from which to write the buffer. * @param buffer The device buffer to copy from. - * @return The file offset from which the buffer was appended. */ public static void writeDeviceBufferToFile(File path, long file_offset, BaseDeviceMemoryBuffer buffer) { - writeToFile(path.getAbsolutePath(), file_offset, buffer.getAddress(), buffer.getLength()); + writeDeviceMemoryToFile(path, file_offset, buffer.getAddress(), buffer.getLength()); + } + + /** + * Write device memory to a given file path synchronously. + *

+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param path The file path to copy to. + * @param file_offset The file offset from which to write the buffer. + * @param address The device memory address to copy from. + * @param length The length to copy. + */ + public static void writeDeviceMemoryToFile(File path, long file_offset, long address, + long length) { + writeToFile(path.getAbsolutePath(), file_offset, address, length); } /** @@ -95,7 +109,21 @@ public static void writeDeviceBufferToFile(File path, long file_offset, * @return The file offset from which the buffer was appended. */ public static long appendDeviceBufferToFile(File path, BaseDeviceMemoryBuffer buffer) { - return appendToFile(path.getAbsolutePath(), buffer.getAddress(), buffer.getLength()); + return appendDeviceMemoryToFile(path, buffer.getAddress(), buffer.getLength()); + } + + /** + * Append device memory to a given file path synchronously. + *

+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param path The file path to copy to. + * @param address The device memory address to copy from. + * @param length The length to copy. + * @return The file offset from which the buffer was appended. + */ + public static long appendDeviceMemoryToFile(File path, long address, long length) { + return appendToFile(path.getAbsolutePath(), address, length); } /** @@ -109,7 +137,21 @@ public static long appendDeviceBufferToFile(File path, BaseDeviceMemoryBuffer bu */ public static void readFileToDeviceBuffer(BaseDeviceMemoryBuffer buffer, File path, long fileOffset) { - readFromFile(buffer.getAddress(), buffer.getLength(), path.getAbsolutePath(), fileOffset); + readFileToDeviceMemory(buffer.getAddress(), buffer.getLength(), path, fileOffset); + } + + /** + * Read a file into device memory synchronously. + *

+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param address The device memory address to read into. + * @param length The length to read. + * @param path The file path to copy from. + * @param fileOffset The file offset from which to copy the content. + */ + public static void readFileToDeviceMemory(long address, long length, File path, long fileOffset) { + readFromFile(address, length, path.getAbsolutePath(), fileOffset); } private static native void writeToFile(String path, long file_offset, long address, long length); From b5531448243794974fae6987957d65d3339ee2ef Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 20 May 2021 17:26:02 -0500 Subject: [PATCH 02/24] Return python lists for __getitem__ calls to list type series (#8265) Make it so that this works: ``` x = cudf.Series([[1,2,None]]) x[0] # [1, 2, ] ``` Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/8265 --- python/cudf/cudf/_lib/cpp/scalar/scalar.pxd | 6 ++ python/cudf/cudf/_lib/scalar.pyx | 63 +++++++++++++++++++-- python/cudf/cudf/core/indexing.py | 6 +- python/cudf/cudf/tests/test_list.py | 18 ++++++ 4 files changed, 87 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd index fec1c6382e6..de5cb05447c 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd @@ -9,6 +9,9 @@ from libcpp.string cimport string from cudf._lib.cpp.types cimport data_type from cudf._lib.cpp.wrappers.decimals cimport scale_type +from cudf._lib.cpp.column.column_view cimport column_view + + cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: cdef cppclass scalar: scalar() except + @@ -60,3 +63,6 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: bool is_valid) except + int64_t value() except + # TODO: Figure out how to add an int32 overload of value() + + cdef cppclass list_scalar(scalar): + column_view view() except + diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 9f8a8ee6b1e..cb355a15f15 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -18,9 +18,18 @@ from libcpp.utility cimport move from libcpp cimport bool import cudf -from cudf._lib.types import cudf_to_np_types, duration_unit_map +from cudf.core.dtypes import ListDtype +from cudf._lib.types import ( + cudf_to_np_types, + duration_unit_map +) from cudf._lib.types import datetime_unit_map -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, dtype_from_column_view + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.table cimport Table +from cudf._lib.interop import to_arrow from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, @@ -41,12 +50,12 @@ from cudf._lib.cpp.scalar.scalar cimport ( timestamp_scalar, duration_scalar, string_scalar, - fixed_point_scalar + fixed_point_scalar, + list_scalar, ) -from cudf.utils.dtypes import _decimal_to_int64 +from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype cimport cudf._lib.cpp.types as libcudf_types - cdef class DeviceScalar: def __init__(self, value, dtype): @@ -97,6 +106,8 @@ cdef class 
DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.Decimal64Dtype): result = _get_py_decimal_from_fixed_point(self.c_value) + elif is_list_dtype(self.dtype): + result = _get_py_list_from_list(self.c_value) elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) elif pd.api.types.is_numeric_dtype(self.dtype): @@ -159,6 +170,22 @@ cdef class DeviceScalar: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) + elif cdtype.id() == libcudf_types.LIST: + if ( + s.get_raw_ptr() + )[0].view().type().id() == libcudf_types.LIST: + s._dtype = dtype_from_column_view( + (s.get_raw_ptr())[0].view() + ) + else: + s._dtype = ListDtype( + cudf_to_np_types[ + ( + (s.get_raw_ptr())[0] + .view().type().id() + ) + ] + ) else: if dtype is not None: s._dtype = dtype @@ -268,6 +295,19 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, ) ) +cdef _get_py_list_from_list(unique_ptr[scalar]& s): + + if not s.get()[0].is_valid(): + return cudf.NA + + cdef column_view list_col_view = (s.get()).view() + cdef Column list_col = Column.from_column_view(list_col_view, None) + cdef Table to_arrow_table = Table({"col": list_col}) + + arrow_table = to_arrow(to_arrow_table, [["col", []]]) + result = arrow_table['col'].to_pylist() + return _nested_na_replace(result) + cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): return cudf.NA @@ -440,3 +480,16 @@ def _create_proxy_nat_scalar(dtype): return result else: raise TypeError('NAT only valid for datetime and timedelta') + + +def _nested_na_replace(input_list): + ''' + Replace `None` with `cudf.NA` in the result of + `__getitem__` calls to list type columns + ''' + for idx, value in enumerate(input_list): + if isinstance(value, list): + _nested_na_replace(value) + elif value is None: + input_list[idx] = cudf.NA + return input_list diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 7de1aaf9726..21d075ae67d 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -85,7 +85,11 @@ def __getitem__(self, arg): arg = list(arg) data = self._sr._column[arg] - if is_scalar(data) or _is_null_host_scalar(data): + if ( + isinstance(data, list) + or is_scalar(data) + or _is_null_host_scalar(data) + ): return data index = self._sr.index.take(arg) return self._sr._copy_construct(data=data, index=index) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dcecc6c9e1..7edcb08a7c8 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf import NA from cudf.tests.utils import assert_eq @@ -332,3 +333,20 @@ def test_concatenate_list_with_nonlist(): gdf1 = cudf.DataFrame({"A": [["a", "c"], ["b", "d"], ["c", "d"]]}) gdf2 = cudf.DataFrame({"A": ["a", "b", "c"]}) gdf1["A"] + gdf2["A"] + + +@pytest.mark.parametrize( + "indata,expect", + [ + ([1], [1]), + ([1, 2, 3], [1, 2, 3]), + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]), + ([None], [NA]), + ([1, None, 3], [1, NA, 3]), + ([[1, None, 3], [None, 5, 6]], [[1, NA, 3], [NA, 5, 6]]), + ], +) +def test_list_getitem(indata, expect): + list_sr = cudf.Series([indata]) + # __getitem__ shall fill None with cudf.NA + assert list_sr[0] == expect From c7d052426d6ceceff732307df13bebfbc15b046a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 May 2021 16:09:23 -0700 Subject: [PATCH 03/24] Copy nested types upon construction (#8244) 
Closes #7561 This PR makes sure upon constructing cudf object, nested types from the pyarrow array is copied to cudf object. This should handle arbitrary nesting of `Lists`, `Structs`. For decimal types, precision is copied from the array. Authors: - Michael Wang (https://github.com/isVoid) - Keith Kraus (https://github.com/kkraus14) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8244 --- python/cudf/cudf/core/column/column.py | 64 ++++++++++++++- python/cudf/cudf/core/dtypes.py | 8 +- python/cudf/cudf/tests/test_dtypes.py | 104 ++++++++++++++++++++++++- 3 files changed, 169 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 20f302f7e59..4bf4b2b87f2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -40,7 +40,12 @@ from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import ( + CategoricalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( check_cast_unsupported_dtype, @@ -291,8 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "None" ] - if isinstance(result.dtype, cudf.Decimal64Dtype): - result.dtype.precision = array.type.precision + result = _copy_type_metadata_from_arrow(array, result) return result def _get_mask_as_column(self) -> ColumnBase: @@ -2230,6 +2234,60 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) +def _copy_type_metadata_from_arrow( + arrow_array: pa.array, cudf_column: ColumnBase +) -> ColumnBase: + """ + Similar to `Column._copy_type_metadata`, except copies type metadata + from arrow array into a cudf column. Recursive for every level. + * When `arrow_array` is struct type and `cudf_column` is StructDtype, copy + field names. + * When `arrow_array` is decimal type and `cudf_column` is + Decimal64Dtype, copy precisions. 
+ """ + if pa.types.is_decimal(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.DecimalColumn + ): + cudf_column.dtype.precision = arrow_array.type.precision + elif pa.types.is_struct(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.StructColumn + ): + base_children = tuple( + _copy_type_metadata_from_arrow(arrow_array.field(i), col_child) + for i, col_child in enumerate(cudf_column.base_children) + ) + cudf_column.set_base_children(base_children) + return cudf.core.column.StructColumn( + data=None, + size=cudf_column.base_size, + dtype=StructDtype.from_arrow(arrow_array.type), + mask=cudf_column.base_mask, + offset=cudf_column.offset, + null_count=cudf_column.null_count, + children=base_children, + ) + elif pa.types.is_list(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.ListColumn + ): + if arrow_array.values and cudf_column.base_children: + base_children = ( + cudf_column.base_children[0], + _copy_type_metadata_from_arrow( + arrow_array.values, cudf_column.base_children[1] + ), + ) + return cudf.core.column.ListColumn( + size=cudf_column.base_size, + dtype=ListDtype.from_arrow(arrow_array.type), + mask=cudf_column.base_mask, + offset=cudf_column.offset, + null_count=cudf_column.null_count, + children=base_children, + ) + + return cudf_column + + def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 7db8ba15caa..f0b0dbba4a5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -143,6 +143,8 @@ def __init__(self, element_type: Any) -> None: def element_type(self) -> Dtype: if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) + elif isinstance(self._typ.value_type, pa.StructType): + return StructDtype.from_arrow(self._typ.value_type) else: return np.dtype(self._typ.value_type.to_pandas_dtype()).name @@ -176,10 +178,10 @@ def __eq__(self, other): return self._typ.equals(other._typ) def __repr__(self): - if isinstance(self.element_type, ListDtype): - return f"ListDtype({self.element_type.__repr__()})" + if isinstance(self.element_type, (ListDtype, StructDtype)): + return f"{type(self).__name__}({self.element_type.__repr__()})" else: - return f"ListDtype({self.element_type})" + return f"{type(self).__name__}({self.element_type})" def __hash__(self): return hash(self._typ) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index b6e2aac0304..a5895caf49f 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -6,14 +6,16 @@ import pytest import cudf +from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, Decimal64Dtype, + IntervalDtype, ListDtype, StructDtype, - IntervalDtype, ) from cudf.tests.utils import assert_eq +from cudf.utils.dtypes import np_to_pa_dtype def test_cdt_basic(): @@ -155,3 +157,103 @@ def test_interval_dtype_pyarrow_round_trip(fields, closed): expect = pa_array got = IntervalDtype.from_arrow(expect).to_arrow() assert expect.equals(got) + + +def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): + """ + In cudf, each column holds its dtype. And since column may have child + columns, child columns also holds their datatype. This method tests + that every level of `column` matches the type of the given `array` + recursively. 
+ """ + + if isinstance(column.dtype, ListDtype): + return array.type.equals( + column.dtype.to_arrow() + ) and assert_column_array_dtype_equal( + column.base_children[1], array.values + ) + elif isinstance(column.dtype, StructDtype): + return array.type.equals(column.dtype.to_arrow()) and all( + [ + assert_column_array_dtype_equal(child, array.field(i)) + for i, child in enumerate(column.base_children) + ] + ) + elif isinstance(column.dtype, Decimal64Dtype): + return array.type.equals(column.dtype.to_arrow()) + elif isinstance(column.dtype, CategoricalDtype): + raise NotImplementedError() + else: + return array.type.equals(np_to_pa_dtype(column.dtype)) + + +@pytest.mark.parametrize( + "data", + [ + [[{"name": 123}]], + [ + [ + { + "IsLeapYear": False, + "data": {"Year": 1999, "Month": 7}, + "names": ["Mike", None], + }, + { + "IsLeapYear": True, + "data": {"Year": 2004, "Month": 12}, + "names": None, + }, + { + "IsLeapYear": False, + "data": {"Year": 1996, "Month": 2}, + "names": ["Rose", "Richard"], + }, + ] + ], + [ + [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], + [ + {"human?": None, "deets": {"weight": 5.3, "age": 25}}, + {"human?": False, "deets": {"weight": 8.0, "age": 31}}, + {"human?": False, "deets": None}, + ], + [], + None, + [{"human?": None, "deets": {"weight": 6.9, "age": None}}], + ], + [ + { + "name": "var0", + "val": [ + {"name": "var1", "val": None, "type": "optional"} + ], + "type": "list", + }, + {}, + { + "name": "var2", + "val": [ + { + "name": "var3", + "val": {"field": 42}, + "type": "optional", + }, + { + "name": "var4", + "val": {"field": 3.14}, + "type": "optional", + }, + ], + "type": "list", + }, + None, + ], + ], +) +def test_lists_of_structs_dtype(data): + got = cudf.Series(data) + expected = pa.array(data) + + assert_column_array_dtype_equal(got._column, expected) + assert expected.equals(got._column.to_arrow()) From 9a85b3baf0742b89ebce8389309efade89bdca3f Mon Sep 17 00:00:00 2001 From: pxLi Date: Fri, 21 May 2021 08:52:26 +0800 Subject: [PATCH 04/24] Update cudfjni version to 21.06.0 (#8292) Signed-off-by: Peixin Li supplement to #8267, as discussed, cudf JNI and plugin will follow pattern YY.MM.P Authors: - pxLi (https://github.com/pxLi) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/8292 --- java/ci/README.md | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/ci/README.md b/java/ci/README.md index 458a76bcd04..968ce279a2c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -49,5 +49,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.06-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-21.06.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index cec20ec04af..fe2d9a453f7 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.06-SNAPSHOT + 21.06.0-SNAPSHOT cudfjni From b84c7923519cb7b64c247a9d010686e0ed4bf1fc Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Fri, 21 May 2021 22:42:33 +0800 Subject: [PATCH 05/24] Fix concatenate_lists_ignore_null on rows of all_nulls (#8312) After the rework of `cudf::lists::concatenate_rows`, something changed on null handling failed [corresponding cuDF Java tests](https://github.com/rapidsai/cudf/blob/branch-21.06/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java#L2234). 
In specific, when we apply `concatenate_null_policy::IGNORE`, the output lists are always null free, even if input data contains rows consisting of all nulls. In my opinion, we had better creating null mask for input rows of `all_nulls`, to keep align with single column concatenate. Signed-off-by: sperlingxx Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/8312 --- .../combine/concatenate_list_elements.cu | 40 ++++++++++++++++--- .../concatenate_list_elements_tests.cpp | 25 +++++++----- .../lists/combine/concatenate_rows_tests.cpp | 36 ++++++++++------- 3 files changed, 70 insertions(+), 31 deletions(-) diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index b76cd19d94b..c5a28a8ec5f 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -41,6 +41,7 @@ namespace { * concatenation. */ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, + bool build_null_mask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -50,9 +51,13 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto out_offsets = make_numeric_column( data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + // The array of int8_t stores validities for the output list elements. + auto validities = rmm::device_uvector(build_null_mask ? num_rows : 0, stream); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); // Concatenating the lists at the same row by converting the entry offsets from the child column // into row offsets of the root column. Those entry offsets are subtracted by the first entry @@ -62,7 +67,22 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, iter, iter + num_rows + 1, d_out_offsets, - [d_row_offsets, d_list_offsets] __device__(auto const idx) { + [d_row_offsets, + d_list_offsets, + lists_dv = *lists_dv_ptr, + d_validities = validities.begin(), + build_null_mask, + iter] __device__(auto const idx) { + if (build_null_mask) { + // The output row will be null only if all lists on the input row are null. + auto const is_valid = thrust::any_of(thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { + return lists_dv.is_valid(list_idx); + }); + d_validities[idx] = static_cast(is_valid); + } auto const start_offset = d_list_offsets[d_row_offsets[0]]; return d_list_offsets[d_row_offsets[idx]] - start_offset; }); @@ -71,11 +91,18 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto out_entries = std::make_unique( lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); + auto [null_mask, null_count] = [&] { + return build_null_mask + ? 
cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr) + : std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + }(); + return make_lists_column(num_rows, std::move(out_offsets), std::move(out_entries), - input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr), + null_count, + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, stream, mr); } @@ -241,9 +268,10 @@ std::unique_ptr concatenate_list_elements(column_view const& input, if (input.size() == 0) { return cudf::empty_like(input); } - return (null_policy == concatenate_null_policy::IGNORE || - !lists_column_view(input).child().has_nulls()) - ? concatenate_lists_ignore_null(input, stream, mr) + bool has_null_list = lists_column_view(input).child().has_nulls(); + + return (null_policy == concatenate_null_policy::IGNORE || !has_null_list) + ? concatenate_lists_ignore_null(input, has_null_list, stream, mr) : concatenate_lists_nullifying_rows(input, stream, mr); } diff --git a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp index de6307471a9..7d79cf4aebe 100644 --- a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp @@ -147,19 +147,23 @@ TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) auto row5 = ListsCol{ListsCol{{1, 2, 3, null}, null_at(3)}, ListsCol{{null}, null_at(0)}, ListsCol{{null, null, null, null, null}, all_nulls()}}; - auto const col = build_lists_col(row0, row1, row2, row3, row4, row5); + auto row6 = + ListsCol{{ListsCol{} /*NULL*/, ListsCol{} /*NULL*/, ListsCol{} /*NULL*/}, all_nulls()}; + auto const col = build_lists_col(row0, row1, row2, row3, row4, row5, row6); // Ignore null list elements. 
{ auto const results = cudf::lists::concatenate_list_elements(col); auto const expected = - ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, - ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, - ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, - ListsCol{{null, 18}, null_at(0)}, - ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, - ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}; + ListsCol{{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at(6)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); } @@ -174,8 +178,9 @@ TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}, - null_at({0, 2, 3})}; + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at({0, 2, 3, 6})}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); } } diff --git a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp index 3e085af7740..af22f329634 100644 --- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -184,24 +184,27 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) ListsCol{{null, 2, 3, 4}, null_at(0)}, ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4}, null_at(2)}, - ListsCol{{1, 2, 3, null}, null_at(3)}}, - null_at(3)} + ListsCol{{1, 2, 3, null}, null_at(3)}, + ListsCol{} /*NULL*/}, + null_at({3, 6})} .release(); auto const col2 = ListsCol{{ListsCol{{10, 11, 12, null}, null_at(3)}, ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, ListsCol{} /*NULL*/, ListsCol{{null, 18}, null_at(0)}, ListsCol{{19, 20, null}, null_at(2)}, - ListsCol{{null}, null_at(0)}}, - null_at(2)} + ListsCol{{null}, null_at(0)}, + ListsCol{} /*NULL*/}, + null_at({2, 6})} .release(); auto const col3 = ListsCol{{ListsCol{} /*NULL*/, ListsCol{{20, null}, null_at(1)}, ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}, ListsCol{}, ListsCol{22, 23, 24, 25}, - ListsCol{{null, null, null, null, null}, all_nulls()}}, - null_at(0)} + ListsCol{{null, null, null, null, null}, all_nulls()}, + ListsCol{} /*NULL*/}, + null_at({0, 6})} .release(); // Ignore null list elements @@ -209,13 +212,15 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view(), col3->view()}}); auto const expected = - ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, - ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, - ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, - ListsCol{{null, 18}, null_at(0)}, - ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, - ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}} + 
ListsCol{{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at(6)} .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); } @@ -232,8 +237,9 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}, - null_at({0, 2, 3})} + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at({0, 2, 3, 6})} .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); } From 6920f9be9237c77258972aab9bfebd1566ac11aa Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 21 May 2021 13:17:20 -0400 Subject: [PATCH 06/24] Update readme with correct CUDA versions (#8315) Replaces CUDA 10.1/10.2 with 11.0/11.2. Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/8315 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 733d1c7897b..587f18d2603 100644 --- a/README.md +++ b/README.md @@ -67,13 +67,13 @@ cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), For `cudf version == 21.06` : ```bash -# for CUDA 10.1 +# for CUDA 11.0 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=10.1 + cudf=21.06 python=3.7 cudatoolkit=11.0 -# or, for CUDA 10.2 +# or, for CUDA 11.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=10.2 + cudf=21.06 python=3.7 cudatoolkit=11.2 ``` From 5c6b92a38c5a82ee259b6414a8bbc568d8e78389 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 21 May 2021 10:44:27 -0700 Subject: [PATCH 07/24] COLLECT_LIST support returning empty output columns. (#8279) Fixes the group-by portion of #7611. When `COLLECT_LIST()` or `COLLECT_SET()` aggregations are called on a grouped input, if the input column is empty, then one sees the following failure: ``` C++ exception with description "cuDF failure at: .../cpp/src/column/column_factories.cpp:67: make_empty_column is invalid to call on nested types" thrown in the test body. ``` The operation should have resulted in an empty `LIST` column. `make_empty_column()` does not support `LIST` types (in part because the `data_type` parameter does not capture the types of the child columns). This commit fixes this by constructing the output column from the specified `values` input, but only for `COLLECT_LIST()` and `COLLECT_SET()`; other aggregation types are unchanged. 
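Not part of this patch — a rough sketch of the call path this fixes, using the public groupby API (the `keys`/`values` names are placeholders for any empty grouped input):

```
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <vector>

// Empty keys and values; before this change the empty-results path called
// make_empty_column() on a LIST type and threw.
cudf::test::fixed_width_column_wrapper<int32_t> keys{};
cudf::test::fixed_width_column_wrapper<int32_t> values{};

cudf::groupby::groupby gb(cudf::table_view({keys}));
std::vector<cudf::groupby::aggregation_request> requests(1);
requests[0].values = values;
requests[0].aggregations.push_back(cudf::make_collect_list_aggregation());

// With this change, results[0].results[0] is an empty LIST<INT32> column
// whose child type is taken from `values`, instead of an exception.
auto [grouped_keys, results] = gb.aggregate(requests);
```
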
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Conor Hoekstra (https://github.com/codereport) - Nghia Truong (https://github.com/ttnghia) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/8279 --- cpp/src/groupby/groupby.cu | 41 +++++++++++++- cpp/tests/groupby/collect_list_tests.cpp | 70 ++++++++++++++++++++++++ cpp/tests/groupby/collect_set_tests.cpp | 3 +- cpp/tests/groupby/nth_element_tests.cpp | 40 ++++++++++++++ 4 files changed, 151 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a5fd6d6f9bb..f132d6b1511 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -79,6 +79,44 @@ std::pair, std::vector> groupby::disp groupby::~groupby() = default; namespace { + +/** + * @brief Factory to construct empty result columns. + * + * Adds special handling for COLLECT_LIST/COLLECT_SET, because: + * 1. `make_empty_column()` does not support construction of nested columns. + * 2. Empty lists need empty child columns, to persist type information. + */ +struct empty_column_constructor { + column_view values; + + template + std::unique_ptr operator()() const + { + using namespace cudf; + using namespace cudf::detail; + + if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { + return make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), empty_like(values), 0, {}); + } + + // If `values` is LIST typed, and the aggregation results match the type, + // construct empty results based on `values`. + // Most generally, this applies if input type matches output type. + // + // Note: `target_type_t` is not recursive, and `ValuesType` does not consider children. + // It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this + // point, because `COLLECT_LIST(LIST)` produces `LIST`, but `target_type_t` + // wouldn't know the difference. 
+ if constexpr (std::is_same_v, ValuesType>) { + return empty_like(values); + } + + return make_empty_column(target_type(values.type(), k)); + } +}; + /// Make an empty table with appropriate types for requested aggs auto empty_results(host_span requests) { @@ -93,7 +131,8 @@ auto empty_results(host_span requests) request.aggregations.end(), std::back_inserter(results), [&request](auto const& agg) { - return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind)); + return cudf::detail::dispatch_type_and_aggregation( + request.values.type(), agg->kind, empty_column_constructor{request.values}); }); return aggregation_result{std::move(results)}; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 7580c1c4e3b..9d2141c913c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -86,6 +86,21 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper values{}; + + fixed_width_column_wrapper expect_keys{}; + lists_column_wrapper expect_vals{}; + + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, CollectLists) { using K = int32_t; @@ -124,6 +139,61 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + auto offsets = data_type{type_to_id()}; + + fixed_width_column_wrapper keys{}; + auto values = cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_child = + cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + auto expect_values = + cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + fixed_width_column_wrapper keys{}; + auto struct_child = LCW{}; + auto struct_column = structs_column_wrapper{{struct_child}}; + + auto values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), struct_column.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_struct_child = LCW{}; + auto expect_struct_column = structs_column_wrapper{{expect_struct_child}}; + + auto expect_child = + cudf::make_lists_column(0, + make_empty_column(data_type{type_to_id()}), + expect_struct_column.release(), + 0, + {}); + auto expect_values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, dictionary) { using 
K = int32_t; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index ce3a9a49372..d5a881a1993 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -58,8 +58,7 @@ TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); TYPED_TEST(CollectSetTypedTest, TrivialInput) { // Empty input - // TODO: Enable this test after issue#7611 has been fixed - // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + test_single_agg(COL_K{}, COL_V{}, COL_K{}, LCL_V{}, CollectSetTest::collect_set()); // Single key input { diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index ec0265a3023..5630cba09da 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -362,5 +362,45 @@ TEST_F(groupby_nth_element_string_test, dictionary) keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2)); } +template +struct groupby_nth_element_lists_test : BaseFixture { +}; + +TYPED_TEST_CASE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_nth_element_lists_test, Basics) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{1, 1, 2, 2, 3, 3}; + auto values = lists{{1, 2}, {3, 4}, {5, 6, 7}, lists{}, {9, 10}, {11}}; + + auto expected_keys = fixed_width_column_wrapper{1, 2, 3}; + auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0)); +} + +TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{}; + auto values = lists{}; + + auto expected_keys = fixed_width_column_wrapper{}; + auto expected_values = lists{}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2)); +} + } // namespace test } // namespace cudf From de579a59714f960fe33440811b4c49e5efeb3f3f Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Fri, 21 May 2021 16:05:18 -0400 Subject: [PATCH 08/24] Added decimal writing for CSV writer (#8296) Addresses #7110 column_to_strings_fn was specialized for fixed point type to enable support for csv writer. A test was added to validate output file created by csv writer for decimal type column. 
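Not part of this patch — a minimal sketch of the newly supported path, modeled on the added test (the file name and column contents are placeholders):

```
#include <cudf/io/csv.hpp>
#include <cudf/strings/convert/convert_fixed_point.hpp>
#include <cudf_test/column_wrapper.hpp>

// Build a DECIMAL64 column (scale -2) from strings, then write it with the CSV writer.
cudf::test::strings_column_wrapper strings({"1.23", "-8.76", "5.43"});
auto decimals = cudf::strings::to_fixed_point(
  cudf::strings_column_view(strings),
  cudf::data_type{cudf::type_to_id<numeric::decimal64>(), numeric::scale_type{-2}});

auto table = cudf::table_view({decimals->view()});
cudf::io::csv_writer_options opts =
  cudf::io::csv_writer_options::builder(cudf::io::sink_info("decimals.csv"), table);

// Previously the writer had no column_to_strings_fn specialization for
// fixed-point columns; with this change the call succeeds.
cudf::io::write_csv(opts);
```
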
Authors: - Kumar Aatish (https://github.com/kaatish) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/8296 --- cpp/src/io/csv/writer_impl.cu | 12 +++- cpp/tests/io/csv_test.cpp | 104 ++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d2b6be5eead..13760381373 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -119,7 +119,8 @@ struct column_to_strings_fn { return not((std::is_same::value) || (std::is_integral::value) || (std::is_floating_point::value) || - (cudf::is_timestamp()) || (cudf::is_duration())); + (cudf::is_fixed_point()) || (cudf::is_timestamp()) || + (cudf::is_duration())); } explicit column_to_strings_fn( @@ -189,6 +190,15 @@ struct column_to_strings_fn { return cudf::strings::detail::from_floats(column, stream_, mr_); } + // fixed point: + // + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_fixed_point(column, stream_, mr_); + } + // timestamps: // template diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 6bc08cf24a6..e45b67505ba 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -22,9 +22,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -61,6 +63,16 @@ using table_view = cudf::table_view; auto const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); +// Base test fixture for tests +struct CsvWriterTest : public cudf::test::BaseFixture { +}; + +template +struct CsvFixedPointWriterTest : public CsvWriterTest { +}; + +TYPED_TEST_CASE(CsvFixedPointWriterTest, cudf::test::FixedPointTypes); + // Base test fixture for tests struct CsvReaderTest : public cudf::test::BaseFixture { }; @@ -307,6 +319,98 @@ TYPED_TEST(CsvReaderNumericTypeTest, SingleColumn) expect_column_data_equal(std::vector(sequence, sequence + num_rows), view.column(0)); } +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) +{ + std::vector reference_strings = { + "1.23", "-8.76", "5.43", "-0.12", "0.25", "-0.23", "-0.27", "0.00", "0.00"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? 
true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnNegativeScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) +{ + std::vector reference_strings = { + "123000", "-876000", "543000", "-12000", "25000", "-23000", "-27000", "0000", "0000"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnPositiveScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + TEST_F(CsvReaderTest, MultiColumn) { constexpr auto num_rows = 10; From 696902d236eb580f947a89ddd147d1c6b7fd1c89 Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Sun, 23 May 2021 06:33:51 -0500 Subject: [PATCH 09/24] Enable implicit casting when concatenating mixed types (#8276) This enables implicit casting when decimal columns are concatenated with numeric columns by casting the numeric columns to decimal columns. 
Closes #8264 Authors: - https://github.com/ChrisJar Approvers: - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8276 --- python/cudf/cudf/core/frame.py | 19 +- python/cudf/cudf/core/series.py | 9 +- python/cudf/cudf/tests/test_concat.py | 265 ++++++++++++++++++++++++++ python/cudf/cudf/utils/dtypes.py | 26 ++- 4 files changed, 291 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f59954aaf08..cda4e8cbd4c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,6 +32,7 @@ is_numerical_dtype, is_scalar, min_scalar_type, + find_common_type, ) T = TypeVar("T", bound="Frame") @@ -4029,8 +4030,11 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = np.find_common_type([col.dtype for col in cols], []) + if all( + is_numerical_dtype(col.dtype) or is_decimal_dtype(col.dtype) + for col in cols + ): + dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) for col in cols @@ -4045,17 +4049,6 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) - elif all( - isinstance(col, cudf.core.column.DecimalColumn) for col in cols - ): - # Find the largest scale and the largest difference between - # precision and scale of the columns to be concatenated - s = max([col.dtype.scale for col in cols]) - lhs = max([col.dtype.precision - col.dtype.scale for col in cols]) - # Combine to get the necessary precision and clip at the maximum - # precision - p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtypes[idx] = cudf.Decimal64Dtype(p, s) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d812214caf8..a894baf8235 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -45,7 +45,6 @@ from cudf.utils import cudautils, docutils, ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _decimal_normalize_types, can_convert_to_column, is_decimal_dtype, is_list_dtype, @@ -53,7 +52,7 @@ is_mixed_with_object_dtype, is_scalar, min_scalar_type, - numeric_normalize_types, + find_common_type, ) from cudf.utils.utils import ( get_appropriate_dispatched_func, @@ -2402,10 +2401,8 @@ def _concat(cls, objs, axis=0, index=True): ) if dtype_mismatch: - if isinstance(objs[0]._column, cudf.core.column.DecimalColumn): - objs = _decimal_normalize_types(*objs) - else: - objs = numeric_normalize_types(*objs) + common_dtype = find_common_type([obj.dtype for obj in objs]) + objs = [obj.astype(common_dtype) for obj in objs] col = _concat_columns([o._column for o in objs]) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 31dc6012905..5c4c121db4d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ 
b/python/cudf/cudf/tests/test_concat.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from decimal import Decimal import cudf as gd from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -1262,3 +1263,267 @@ def test_concat_decimal_series(ltype, rtype): expected = pd.concat([ps1, ps2]) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "df1, df2, df3, expected", + [ + ( + gd.DataFrame( + {"val": [Decimal("42.5"), Decimal("8.7")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame( + {"val": [Decimal("9.23"), Decimal("-67.49")]}, + dtype=Decimal64Dtype(6, 4), + ), + gd.DataFrame({"val": [8, -5]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("42.5"), + Decimal("8.7"), + Decimal("9.23"), + Decimal("-67.49"), + Decimal("8"), + Decimal("-5"), + ] + }, + dtype=Decimal64Dtype(7, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("95.2"), Decimal("23.4")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame({"val": [54, 509]}, dtype="uint16"), + gd.DataFrame({"val": [24, -48]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("95.2"), + Decimal("23.4"), + Decimal("54"), + Decimal("509"), + Decimal("24"), + Decimal("-48"), + ] + }, + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("36.56"), Decimal("-59.24")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), + gd.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("36.56"), + Decimal("-59.24"), + Decimal("403.21"), + Decimal("45.13"), + Decimal("52.262"), + Decimal("-49.25"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("9563.24"), Decimal("236.633")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), + gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("9563.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): + df = gd.concat([df1, df2, df3]) + assert_eq(df, expected) + assert_eq(df.val.dtype, expected.val.dtype) + + +@pytest.mark.parametrize( + "s1, s2, s3, expected", + [ + ( + gd.Series( + [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) + ), + gd.Series( + [Decimal("101.243"), Decimal("-92.449")], + dtype=Decimal64Dtype(9, 6), + ), + gd.Series([94, -22], dtype="int32"), + gd.Series( + [ + Decimal("32.8"), + Decimal("-87.7"), + Decimal("101.243"), + Decimal("-92.449"), + Decimal("94"), + Decimal("-22"), + ], + dtype=Decimal64Dtype(10, 6), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series([33, 984], dtype="uint32"), + gd.Series([593, -702], dtype="int32"), + gd.Series( + [ + Decimal("7.2"), + Decimal("122.1"), + Decimal("33"), + Decimal("984"), + Decimal("593"), + Decimal("-702"), + ], + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("982.94"), Decimal("-493.626")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([847.98, 254.442], dtype="float32"), + gd.Series([5299.262, -2049.25], dtype="float64"), + gd.Series( + [ + Decimal("982.94"), + Decimal("-493.626"), + Decimal("847.98"), + 
Decimal("254.442"), + Decimal("5299.262"), + Decimal("-2049.25"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("492.204"), Decimal("-72824.455")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([8438, -27462], dtype="int64"), + gd.Series([-40.292, 49202.953], dtype="float64"), + gd.Series( + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + Decimal("-40.292"), + Decimal("49202.953"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_series(s1, s2, s3, expected): + s = gd.concat([s1, s2, s3]) + assert_eq(s, expected) + + +@pytest.mark.parametrize( + "s1, s2, expected", + [ + ( + gd.Series( + [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series(["2007-06-12", "2006-03-14"], dtype="datetime64"), + gd.Series( + [ + "955.22", + "8.20", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", + ], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("-52.44"), Decimal("365.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ), + dtype="datetime64[s]", + ), + gd.Series( + [ + "-52.44", + "365.22", + "2005-02-01 12:00:00", + "2005-02-01 13:00:00", + "2005-02-01 14:00:00", + ], + index=[0, 1, 0, 1, 2], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), + gd.Series( + ["753.00", "94.22", "0 days 00:01:51", "0 days 00:08:29"], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] + ), + gd.Series( + ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], + index=[0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_non_numeric(s1, s2, expected): + s = gd.concat([s1, s2]) + assert_eq(s, expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 16c35bab4b1..0b59116f8e6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -290,13 +290,15 @@ def is_decimal_dtype(obj): ) -def _decimal_normalize_types(*args): - s = max([a.dtype.scale for a in args]) - lhs = max([a.dtype.precision - a.dtype.scale for a in args]) +def _find_common_type_decimal(dtypes): + # Find the largest scale and the largest difference between + # precision and scale of the columns to be concatenated + s = max([dtype.scale for dtype in dtypes]) + lhs = max([dtype.precision - dtype.scale for dtype in dtypes]) + # Combine to get the necessary precision and clip at the maximum + # precision p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtype = cudf.Decimal64Dtype(p, s) - - return [a.astype(dtype) for a in args] + return cudf.Decimal64Dtype(p, s) def cudf_dtype_from_pydata_dtype(dtype): @@ -690,9 +692,15 @@ def find_common_type(dtypes): dtypes = set(dtypes) if any(is_decimal_dtype(dtype) for dtype in dtypes): - raise NotImplementedError( - "DecimalDtype is not yet supported in find_common_type" - ) + if all( + is_decimal_dtype(dtype) or is_numerical_dtype(dtype) + for dtype in dtypes + ): + return _find_common_type_decimal( + [dtype for dtype in dtypes if is_decimal_dtype(dtype)] + ) + else: + return np.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately From 
ef20706d2f66ba6b32611f99c7b265c26d543d11 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 24 May 2021 07:22:37 -0400 Subject: [PATCH 10/24] Add separator-on-null parameter to strings concatenate APIs (#8282) Closes #4728 This PR adds a new parameter to the `cudf::strings::concatenate` APIs to specify if separators should be added between null entries when the null-replacement (narep) parameter is valid. If the narep scalar is invalid (i.e. null itself) then the entire output row becomes null. If not, separators are added between each element. Examples: ``` s1 = ['a', 'b', null, 'dd', null] s2 = ['A', null, 'CC', 'D', null] concatenate( {s1, s2}, sep='+', narep=invalid ) -> ['a+A', null, null, 'dd+D', null] concatenate( {s1, s2}, sep='+', narep='@' ) -> ['a+A', 'b+@', '@+CC', 'dd+D', '@+@'] concatenate( {s1, s2}, sep='+', narep='' ) -> ['a+A', 'b+', '+CC', 'dd+D', '+'] ``` The new parameter is an enum `separator_on_nulls` which has `YES` or `NO` settings. The default parameter value will be `YES` to keep the current behavior as expected by Python cudf and for consistency with Pandas behavior. Specifying `NO` here will suppress the separator with null elements (when narep is valid). ``` concatenate( {s1, s2}, sep='+', narep='', NO ) -> ['a+A', 'b', 'CC', 'dd+D', ''] ``` This PR also changes the name of the `cudf::strings::concatenate_list_elements` API to `cudf::strings::join_list_elements` instead. The API pattern and behavior more mimic the `cudf::strings::join_strings` then the concatenate functions. Also, these are called by the Python `join` functions so the rename makes it more consistent with cudf. This is a breaking change in order to make these APIs more consistent. Previously, the separators column version was returning nulls only for an all-null row. This has been changed to honor the `separator_on_null` parameter instead. Currently there was no Python cudf API calling this version. Only the rename required minor changes to the Cython layer. The gtests were updated to reflect the new behavior. None of the pytests required any changes since the default parameter value matches the original behavior for those APIs that cudf actually calls. 
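Not part of this patch — a rough sketch of the scalar-separator overload with the new parameter, assuming it gains the same `separator_on_nulls` argument as the column-separator overload below; expected outputs are taken from the examples above:

```
#include <cudf/strings/combine.hpp>
#include <cudf_test/column_wrapper.hpp>

// s1 = ['a', 'b', null, 'dd', null], s2 = ['A', null, 'CC', 'D', null]
cudf::test::strings_column_wrapper s1({"a", "b", "", "dd", ""}, {1, 1, 0, 1, 0});
cudf::test::strings_column_wrapper s2({"A", "", "CC", "D", ""}, {1, 0, 1, 1, 0});

// Default behavior: separators are added between null replacements.
auto with_seps = cudf::strings::concatenate(cudf::table_view({s1, s2}),
                                            cudf::string_scalar("+"),
                                            cudf::string_scalar(""),  // narep
                                            cudf::strings::separator_on_nulls::YES);
// -> ['a+A', 'b+', '+CC', 'dd+D', '+']

// New behavior: suppress the separator next to null elements.
auto without_seps = cudf::strings::concatenate(cudf::table_view({s1, s2}),
                                               cudf::string_scalar("+"),
                                               cudf::string_scalar(""),
                                               cudf::strings::separator_on_nulls::NO);
// -> ['a+A', 'b', 'CC', 'dd+D', '']
```
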
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Keith Kraus (https://github.com/kkraus14) - Thomas Graves (https://github.com/tgravescs) - Christopher Harris (https://github.com/cwharris) URL: https://github.com/rapidsai/cudf/pull/8282 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/strings/combine.hpp | 134 ++++++++----- cpp/include/cudf/strings/detail/combine.hpp | 4 +- cpp/src/io/csv/writer_impl.cu | 15 +- cpp/src/strings/combine/concatenate.cu | 177 +++++++++--------- ...list_elements.cu => join_list_elements.cu} | 128 +++++++------ cpp/tests/CMakeLists.txt | 2 +- .../strings/combine/concatenate_tests.cpp | 125 ++++++++++--- ...tests.cpp => join_list_elements_tests.cpp} | 117 +++++++----- python/cudf/cudf/_lib/cpp/strings/combine.pxd | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- 11 files changed, 445 insertions(+), 269 deletions(-) rename cpp/src/strings/combine/{concatenate_list_elements.cu => join_list_elements.cu} (64%) rename cpp/tests/strings/combine/{concatenate_list_elements_tests.cpp => join_list_elements_tests.cpp} (82%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af6f60b031d..aa3b4406320 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,8 +333,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu - src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 6887ef0e670..360efe15303 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,21 @@ namespace strings { * @brief Strings APIs for concatenate and join */ +/** + * @brief Setting for specifying how separators are added with + * null strings elements. + */ +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. 
* * @code{.pseudo} * Example: @@ -70,11 +79,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. + * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -83,16 +90,25 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] - * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * sep_na = non-valid scalar + * out = concatenate({c0, c1, c2}, sep, sep_na, col_narep) + * // only the null entry in the sep column produces a null row + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * + * col_narep = '' + * out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls:NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh'] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -108,6 +124,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. */ @@ -116,15 +134,9 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @addtogroup strings_combine - * @{ - * @file strings/combine.hpp - * @brief Strings APIs for concatenate and join - */ - /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. @@ -136,20 +148,30 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. 
* - * The number of strings in the columns provided must be the same. + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. + * + * More than one column must be specified in the input @p strings_columns + * table. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. * @throw cudf::logic_error if separator is not valid. + * @throw cudf::logic_error if only one column is specified * * @param strings_columns List of string columns to concatenate. * @param separator String that should inserted between each string from each row. @@ -157,6 +179,7 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -164,6 +187,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -171,24 +195,30 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. + * delimited by the row separator provided in the @p separators strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` - * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided * to be used in place of the null separators. * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. 
+ * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] * sep = ['::', '%%', '!', '*', null] * - * r1 = strings::concatenate_list_elements(s, sep) - * r1 is ['aa::bb::cc', null, '!dd', null, null] + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * - * r2 = strings::concatenate_list_elements(s, sep, ':', '_') - * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -203,14 +233,16 @@ std::unique_ptr concatenate( * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,21 +250,27 @@ std::unique_ptr concatenate_list_elements( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. + * delimited by the @p separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a narep - * string is specified to be used in its place. + * having a null elenent will result in the corresponding output row to be null unless a + * @p narep string is specified to be used in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] * - * r1 = strings::concatenate_list_elements(s) - * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * - * r2 = strings::concatenate_list_elements(s, ':', '_') - * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. 
@@ -244,13 +282,15 @@ std::unique_ptr concatenate_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 6e25a4dfa38..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,6 +37,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 13760381373..bc0e1243d4f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -414,11 +415,19 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - //(using null representation and delimiter): + // (using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate( - str_table_view, delimiter_str, options_.get_na_rep(), stream); + auto str_concat_col = [&] { + if (str_table_view.num_columns() > 1) + return cudf::strings::detail::concatenate(str_table_view, + delimiter_str, + options_.get_na_rep(), + strings::separator_on_nulls::YES, + stream); + cudf::string_scalar narep{options_.get_na_rep()}; + return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + }(); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 5d7b9152ff3..1329ad3113f 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,67 +41,93 @@ namespace strings { namespace detail { namespace { -/** - * @brief Concatenate strings functor - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. 
- */ -struct concat_strings_fn { +struct concat_strings_base { table_device_view const d_table; - string_view const d_separator; string_scalar_device_view const d_narep; + separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. + * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void process_row(size_type idx, string_view const d_separator) { - bool const null_element = - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); - // handle a null row - if (null_element && !d_narep.is_valid()) { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - auto const d_str = - d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - // separator goes only in between elements - if (itr + 1 < d_table.end()) { + auto const d_column = *itr; + bool const null_element = d_column.is_null(idx); + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); + write_separator = false; } + + // write out column's row data (or narep if the row is null) + auto const d_str = null_element ? 
d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } + if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Single separator concatenate functor + */ +struct concat_strings_fn : concat_strings_base { + string_view const d_separator; + + concat_strings_fn(table_device_view const& d_table, + string_view const& d_separator, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) + { + } + + __device__ void operator()(size_type idx) { process_row(idx, d_separator); } +}; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_columns = strings_columns.num_columns(); - CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); + CUDF_EXPECTS(num_columns > 1, "At least two columns must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), strings_columns.end(), [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - if (num_columns == 1) // single strings column returns a copy - return std::make_unique(*(strings_columns.begin()), stream, mr); auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -112,7 +138,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep}; + concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -120,9 +146,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - bool null_element = thrust::any_of( + if (d_narep.is_valid()) return true; + return !thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); - return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -145,68 +171,42 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. 
*/ -struct multi_separator_concat_fn { - table_device_view const d_table; +struct multi_separator_concat_fn : concat_strings_base { column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - string_scalar_device_view const d_narep; - offset_type* d_offsets{}; - char* d_chars{}; - __device__ void operator()(size_type idx) + multi_separator_concat_fn(table_device_view const& d_table, + column_device_view const& d_separators, + string_scalar_device_view const& d_separator_narep, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, + d_separators(d_separators), + d_separator_narep(d_separator_narep) { - bool const all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); + } - if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || - (all_nulls && !d_narep.is_valid())) { + __device__ void operator()(size_type idx) + { + if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - // point to output location - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - - // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; - - // write output entry for this row - bool colval_written = false; // state variable for writing separators - for (auto const d_column : d_table) { - // if the row is null and if there is no replacement, skip it - if (d_column.is_null(idx) && !d_narep.is_valid()) continue; - - // separator in this row is written only after the first output - if (colval_written) { - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); - bytes += d_separator.size_bytes(); - } - - // write out column's row data (or narep if the row is null) - string_view const d_str = - d_column.is_null(idx) ? d_null_rep : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - // column's string or narep could by empty so we need this flag - // to know we got this far even if no actual bytes were copied - colval_written = true; // use the separator before the next column - } - - if (!d_chars) d_offsets[idx] = bytes; + // base class utility function handles the rest + process_row(idx, d_separator); } }; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -234,20 +234,19 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; + multi_separator_concat_fn mscf{ + *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { - if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; - bool all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { - return col.is_null(ridx); - }); - return all_nulls ? col_rep.is_valid() : true; + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { + if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; + if (col_rep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); }, stream, mr); @@ -268,21 +267,29 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); + return detail::concatenate( + strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, + separators, + separator_narep, + col_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu similarity index 64% rename from cpp/src/strings/combine/concatenate_list_elements.cu rename to cpp/src/strings/combine/join_list_elements.cu index 1157b8f3fce..7a83097566c 100644 --- a/cpp/src/strings/combine/concatenate_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ struct compute_size_and_concatenate_fn { offset_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; + separator_on_nulls const separate_nulls; offset_type* d_offsets{nullptr}; @@ -72,33 +74,38 @@ struct compute_size_and_concatenate_fn { return; } - auto const separator = func.separator(idx); - auto const separator_size = separator.size_bytes(); - auto size_bytes = size_type{0}; - bool written = false; - char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto const separator = func.separator(idx); + auto size_bytes = size_type{0}; + char* output_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + bool write_separator = false; for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; ++str_idx) { - if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + bool null_element = strings_dv.is_null(str_idx); + + if (not d_chars and (null_element and not string_narep_dv.is_valid())) { d_offsets[idx] = 0; d_validities[idx] = false; return; // early termination: the entire list of strings will result in a null string } - auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() - : strings_dv.element(str_idx); - size_bytes += separator_size + d_str.size_bytes(); - if (output_ptr) { - // Separator is inserted only in between strings - if (written) { output_ptr = detail::copy_string(output_ptr, separator); } - output_ptr = detail::copy_string(output_ptr, d_str); - written = true; + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { + if (output_ptr) output_ptr = detail::copy_string(output_ptr, separator); + size_bytes += separator.size_bytes(); + write_separator = false; } + + auto const d_str = + null_element ? string_narep_dv.value() : strings_dv.element(str_idx); + if (output_ptr) output_ptr = detail::copy_string(output_ptr, d_str); + size_bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - // Separator is inserted only in between strings if (not d_chars) { - d_offsets[idx] = static_cast(size_bytes - separator_size); + d_offsets[idx] = size_bytes; d_validities[idx] = true; } } @@ -123,11 +130,12 @@ struct scalar_separator_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -146,14 +154,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv = get_scalar_device_view(const_cast(separator)); auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); - auto const func = scalar_separator_fn{sep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -191,12 +199,13 @@ struct column_separators_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& 
lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -217,14 +226,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); - auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -239,25 +248,32 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists } // namespace detail -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements(lists_strings_column, + separators, + separator_narep, + string_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f36ec70479b..bbcfd69a52b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp 
b/cpp/tests/strings/combine/concatenate_tests.cpp index c1c390e8a82..d91f669e42d 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,6 +95,58 @@ TEST_F(StringsCombineTest, Concatenate) } } +TEST_F(StringsCombineTest, ConcatenateSkipNulls) +{ + cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"}, + {1, 0, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"}, + {1, 0, 1, 1, 1, 0, 1}); + cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, + {1, 1, 1, 1, 1, 0, 1}); + + cudf::table_view table({strings1, strings2, strings3}); + + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "++", "+d+s", "+éa+t", "aa++u", "bbb++", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d+s", "+éa+t", "aa++u", "bbb", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); + auto results = cudf::strings::concatenate(table, + cudf::strings_column_view(sep_col), + cudf::string_scalar(""), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d.s", "@éa@t", "aa**u", "bbb", "ééé#f#w"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } +} + TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -107,6 +159,12 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) cudf::test::expect_strings_empty(results->view()); } +TEST_F(StringsCombineTest, SingleColumnErrorCheck) +{ + cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0}}), cudf::logic_error); +} + struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture { }; @@ -157,7 +215,6 @@ TEST_F(StringsConcatenateWithColSeparatorTest, SingleColumnEmptyAndNullStringsNo auto exp_results = cudf::test::strings_column_wrapper({"", "", "", ""}, {false, true, false, false}); - auto results = cudf::strings::concatenate(cudf::table_view{{col0}}, cudf::strings_column_view(sep_col)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); @@ -295,12 +352,20 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); - + auto exp_results1 = cudf::test::strings_column_wrapper( + 
{"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -315,13 +380,23 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, + {true, true, false, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, true, true, true}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -335,26 +410,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!"); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", - "~~~", - "!!!!!!!!!!éaff", - "éééf", - "éa", - "", - "éaff", - "valid", - "doo", - "", - "", - ""}, - {true, true, true, true, true, true, true, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + 
{"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, true, true, true, true, true, true, true, true, true}); + + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + sep_rep, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp similarity index 82% rename from cpp/tests/strings/combine/concatenate_list_elements_tests.cpp rename to cpp/tests/strings/combine/join_list_elements_tests.cpp index b6afd588dfb..e2f7c3e36a2 100644 --- a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -58,7 +58,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) { auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv), cudf::logic_error); } // Invalid scalar separator @@ -66,9 +66,8 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) auto const string_lists = STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW( - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), - cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); } // Invalid column separators @@ -77,7 +76,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, separators->view()), cudf::logic_error); } } @@ -87,26 +86,26 @@ TEST_F(StringsListsConcatenateTest, EmptyInput) auto const string_lists = STR_LISTS{}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) { auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}, STR_LISTS{}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{"", "", ""}; + auto const expected = STR_COL{"", "", "", ""}; - auto results = 
cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - auto const separators = STR_COL{"", "", ""}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const separators = STR_COL{"", "", "", ""}.release(); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } @@ -120,29 +119,35 @@ TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{{"", "", ""}, all_nulls()}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } +auto null_at(std::initializer_list indices) +{ + return cudf::detail::make_counting_transform_iterator( + 0, [indices](auto i) { return std::find(indices.begin(), indices.end(), i) == indices.end(); }); +} + TEST_F(StringsListsConcatenateTest, ScalarSeparator) { auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, STR_LISTS{}, /*NULL*/ STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, null_at({1, 2})}}, null_at(1)} .release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); // No null replacement { - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx", nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -150,10 +155,22 @@ TEST_F(StringsListsConcatenateTest, ScalarSeparator) // With null replacement { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ - "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx", "v+++___+++___+++w"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{"a+++ccc", nullptr, "efgh+++ijk", "zzz+++xxxxx", "v+++w"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -181,8 +198,7 @@ TEST_F(StringsListsConcatenateTest, 
SlicedListsWithScalarSeparator) // Sliced the entire lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, @@ -202,7 +218,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -223,8 +239,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -234,7 +249,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; @@ -246,8 +261,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; auto const expected = @@ -258,7 +272,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"abcdef+++012345+++___+++xxx000", "___+++11111+++00000", @@ -274,8 +288,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto 
const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; auto const expected = @@ -286,7 +299,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"zzz+++xxxxx", nullptr, @@ -318,7 +331,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // No null replacement { - auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const results = cudf::strings::join_list_elements(string_lv, separators->view()); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -327,8 +340,8 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for separators { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||")); + auto const results = + cudf::strings::join_list_elements(string_lv, separators->view(), cudf::string_scalar("|||")); std::vector h_expected{ nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = @@ -338,7 +351,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); std::vector h_expected{ "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; @@ -349,7 +362,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for both separators and strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); std::vector h_expected{"a+++XXXXX+++ccc", nullptr, @@ -361,6 +374,20 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + separators->view(), + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{ + "a+++ccc", nullptr, "0a0b0c+++xyzééé", "efgh+++ijk", "ááá%%%ííí", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } } TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) @@ -390,7 +417,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - 
auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, @@ -411,7 +438,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -433,7 +460,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -444,7 +471,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; @@ -457,7 +484,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{ nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; auto const expected = @@ -469,7 +496,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", "___~!~11111~!~00000", @@ -486,7 +513,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, 
"0a0b0c###5x5y5z"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -497,7 +524,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"zzz|||xxxxx", nullptr, diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 250c6441882..51c706b68d0 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..0d7dfb5c619 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - concatenate_list_elements as cpp_concatenate_list_elements + join_list_elements as cpp_join_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, separator_view, scalar_separator_narep[0], From b9588d1fe570c09ad333ada52210ad1e1c998da7 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Mon, 24 May 2021 21:11:03 +0800 Subject: [PATCH 11/24] JNI: Refactor the code of making column from scalar (#8310) This small PR is to replace the JNI implementation with the corresponding cudf API `make_column_from_scalar`. The PR https://github.com/rapidsai/cudf/pull/8185/ has added the support for nested type, so it is ok to do this now. 
Signed-off-by: Firestarman Authors: - Liangcai Li (https://github.com/firestarman) Approvers: - Bobby Wang (https://github.com/wbo4958) - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8310 --- java/src/main/native/src/ColumnVectorJni.cpp | 43 ++------------------ 1 file changed, 4 insertions(+), 39 deletions(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index a09de5c61e3..2953a6221e8 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -220,49 +220,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, jclass, jlong j_scalar, jint row_count) { - using ScalarType = cudf::scalar_type_t; JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0); try { cudf::jni::auto_set_device(env); auto scalar_val = reinterpret_cast(j_scalar); - auto dtype = scalar_val->type(); - cudf::mask_state mask_state = - scalar_val->is_valid() ? cudf::mask_state::UNALLOCATED : cudf::mask_state::ALL_NULL; std::unique_ptr col; - if (dtype.id() == cudf::type_id::LIST) { - // Neither 'cudf::make_empty_column' nor 'cudf::make_column_from_scalar' supports - // LIST type for now (https://github.com/rapidsai/cudf/issues/8088), so the list - // precedes the others and takes care of the empty column itself. - auto s_list = reinterpret_cast(scalar_val); - cudf::column_view s_val = s_list->view(); - - // Offsets: [0, list_size, list_size*2, ..., list_szie*row_count] - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - zero->set_valid(true); - step->set_valid(true); - static_cast(zero.get())->set_value(0); - static_cast(step.get())->set_value(s_val.size()); - std::unique_ptr offsets = cudf::sequence(row_count + 1, *zero, *step); - // Data: - // Builds the data column by leveraging `cudf::concatenate` to repeat the 's_val' - // 'row_count' times, because 'cudf::make_column_from_scalar' does not support list - // type. - // (Assumes the `row_count` is not big, otherwise there would be a performance issue.) - // Checks the `row_count` because `cudf::concatenate` does not support no rows. - auto data_col = row_count > 0 - ? cudf::concatenate(std::vector(row_count, s_val)) - : cudf::empty_like(s_val); - col = cudf::make_lists_column(row_count, std::move(offsets), std::move(data_col), - cudf::state_null_count(mask_state, row_count), - cudf::create_null_mask(row_count, mask_state)); - } else if (row_count == 0) { - col = cudf::make_empty_column(dtype); - } else if (cudf::is_fixed_width(dtype)) { - col = cudf::make_fixed_width_column(dtype, row_count, mask_state); - auto mut_view = col->mutable_view(); - cudf::fill_in_place(mut_view, 0, row_count, *scalar_val); - } else if (dtype.id() == cudf::type_id::STRING) { + if (scalar_val->type().id() == cudf::type_id::STRING) { + // Tests fail when using the cudf implementation, complaining no child for string column. + // So here take care of the String type itself. 
// create a string column of all empty strings to fill (cheapest string column to create) auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, row_count + 1, cudf::mask_state::UNALLOCATED); @@ -273,7 +238,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, col = cudf::fill(str_col->view(), 0, row_count, *scalar_val); } else { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); + col = cudf::make_column_from_scalar(*scalar_val, row_count); } return reinterpret_cast(col.release()); } From 936b02d3c8966c059317a6306a96297637fe545d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 24 May 2021 06:47:55 -0700 Subject: [PATCH 12/24] Add description of the cuIO GDS integration (#8293) Adds a document to describe cuIO behavior with respect to the GDS library use. Also includes a disclaimer about the current state of the integration. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8293 --- docs/cudf/source/io-gds-integration.rst | 22 ++++++++++++++++++++++ docs/cudf/source/io.rst | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 docs/cudf/source/io-gds-integration.rst diff --git a/docs/cudf/source/io-gds-integration.rst b/docs/cudf/source/io-gds-integration.rst new file mode 100644 index 00000000000..9ccf773b2e4 --- /dev/null +++ b/docs/cudf/source/io-gds-integration.rst @@ -0,0 +1,22 @@ +GPUDirect Storage Integration +============================= + +Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. +GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. +GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. +The SDK is available for download `here `_. + +Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through environment variable ``LIBCUDF_CUFILE_POLICY``. +This variable also controls the GDS compatibility mode. There are two special values for the environment variable: + +- "GDS": Use of GDS is enabled; GDS compatibility mode is *off*. +- "ALWAYS": Use of GDS is enabled; GDS compatibility mode is *on*. + +Any other value (or no value set) will keep the GDS disabled for use in cuDF and IO will be done using cuDF's CPU bounce buffers. + +This environment variable also affects how cuDF treats GDS errors. +When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. +When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), +cuDF throws an exception to propagate the error to te user. + +NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. \ No newline at end of file diff --git a/docs/cudf/source/io.rst b/docs/cudf/source/io.rst index 5186473ae10..e88162d8f52 100644 --- a/docs/cudf/source/io.rst +++ b/docs/cudf/source/io.rst @@ -8,4 +8,5 @@ This page contains Input / Output related APIs in cuDF. 
:maxdepth: 2 :caption: Contents: - io-supported-types.rst \ No newline at end of file + io-supported-types.rst + io-gds-integration.rst \ No newline at end of file From 259d69ba4916d62a9d345e741b6c2be5ae4183fd Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 24 May 2021 09:19:57 -0500 Subject: [PATCH 13/24] Revert "patch thrust to fix intmax num elements limitation in scan_by_key" (#8263) Reverts #8199 According to @allisonvacanti (NVIDIA/thrust#1424 (comment)) this patch will likely have adverse effect on performance. We should revert it until a better solution can be found. Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - David Wendt (https://github.com/davidwendt) - Keith Kraus (https://github.com/kkraus14) - Elias Stehle (https://github.com/elstehle) URL: https://github.com/rapidsai/cudf/pull/8263 --- cpp/cmake/thrust.patch | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index c14b8cdafe5..2f9201d8ab4 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -81,25 +81,3 @@ index c0c6d59..937ee31 100644 { typedef AgentScanPolicy< 128, 15, ///< Threads per block, items per thread -diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h -index fe4b321c..b3974c69 100644 ---- a/thrust/system/cuda/detail/scan_by_key.h -+++ b/thrust/system/cuda/detail/scan_by_key.h -@@ -513,7 +513,7 @@ namespace __scan_by_key { - scan_op(scan_op_) - { - int tile_idx = blockIdx.x; -- Size tile_base = ITEMS_PER_TILE * tile_idx; -+ Size tile_base = ITEMS_PER_TILE * static_cast(tile_idx); - Size num_remaining = num_items - tile_base; - - if (num_remaining > ITEMS_PER_TILE) -@@ -734,7 +734,7 @@ namespace __scan_by_key { - ScanOp scan_op, - AddInitToScan add_init_to_scan) - { -- int num_items = static_cast(thrust::distance(keys_first, keys_last)); -+ size_t num_items = static_cast(thrust::distance(keys_first, keys_last)); - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; From 3da0d121b0296d8baba92133f078fe108ac5b72c Mon Sep 17 00:00:00 2001 From: shaneding Date: Mon, 24 May 2021 10:23:06 -0400 Subject: [PATCH 14/24] added _is_homogeneous property (#8299) This PR closes #7067. This was implemented by adding the `_is_homogeneous` property to `DataFrame`. Included are appropriate test cases. 
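A few illustrative cases, mirroring the tests below (a sketch only; `_is_homogeneous` is an internal property, not public API):

```python
import cudf

# All columns share one dtype (int64) -> homogeneous.
cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})._is_homogeneous        # True

# int64 mixed with float64 -> not homogeneous.
cudf.DataFrame({"a": [1, 2, 3], "b": [1.2, 1.0, 2.0]})._is_homogeneous  # False

# No columns at all -> treated as homogeneous.
cudf.DataFrame({})._is_homogeneous                                      # True
```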
Authors: - https://github.com/shaneding Approvers: - https://github.com/brandon-b-miller - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/8299 --- python/cudf/cudf/core/frame.py | 9 +++ python/cudf/cudf/tests/test_dataframe.py | 97 ++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cda4e8cbd4c..1c6c1ed85e6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -157,6 +157,15 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def _is_homogeneous(self): + # make sure that the dataframe has columns + if not self._data.columns: + return True + + first_type = self._data.columns[0].dtype.name + return all(x.dtype.name == first_type for x in self._data.columns) + @property def empty(self): """ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5e36ba7e21..0b73f32e94d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8579,3 +8579,100 @@ def test_dataframe_init_from_series(data, columns, index): actual, check_index_type=False if len(expected) == 0 else True, ) + + +@pytest.mark.parametrize( + "data, expected", + [ + ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), + ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), + ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), + ({"a": [True, False, False], "b": [False, False, True]}, True), + ({"a": [True, False, False]}, True), + ({"a": [[1, 2], [3, 4]]}, True), + ({"a": [[1, 2], [3, 4]], "b": ["a", "b"]}, False), + ({"a": [{"c": 5}, {"e": 5}], "b": [{"c": 5}, {"g": 7}]}, True), + ({}, True), + ], +) +def test_is_homogeneous_dataframe(data, expected): + actual = cudf.DataFrame(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, indexes, expected", + [ + ( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, + ["a", "b"], + True, + ), + ( + { + "a": [1, 2, 3, 4], + "b": [5, 6, 7, 8], + "c": [1.2, 1, 2, 3], + "d": ["hello", "world", "cudf", "rapids"], + }, + ["a", "b"], + False, + ), + ( + { + "a": ["a", "b", "c"], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [1, 2, 3], + }, + ["a", "b"], + True, + ), + ], +) +def test_is_homogeneous_multiIndex_dataframe(data, indexes, expected): + test_dataframe = cudf.DataFrame(data).set_index(indexes) + actual = cudf.DataFrame(test_dataframe)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", [([1, 2, 3, 4], True), ([True, False], True)] +) +def test_is_homogeneous_series(data, expected): + actual = cudf.Series(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "levels, codes, expected", + [ + ( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + True, + ), + ( + [[1, 2, 3], [True, False, True]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + False, + ), + ], +) +def test_is_homogeneous_multiIndex(levels, codes, expected): + actual = cudf.MultiIndex(levels=levels, codes=codes)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", + [([1, 2, 3], True), (["Hello", "World"], True), ([True, False], True)], +) +def test_is_homogeneous_index(data, expected): + actual = 
cudf.Index(data)._is_homogeneous + + assert actual == expected From 63faf2f14eb6114997eb7406b67eb174d26cfdf8 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 24 May 2021 10:03:29 -0500 Subject: [PATCH 15/24] Use empty_like in scatter (#8314) This prevents things like partition from working with deeply nested arrays. I marked this as non-breaking, but I am happy to change it to breaking because I removed a detailed API that is not used anywhere else and is flawed. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - https://github.com/nvdbaranec - Conor Hoekstra (https://github.com/codereport) - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/8314 --- cpp/include/cudf/lists/detail/copying.hpp | 17 +---------------- cpp/include/cudf/lists/detail/scatter.cuh | 5 +---- cpp/src/lists/copying/copying.cu | 13 ------------- cpp/tests/partitioning/partition_test.cpp | 20 ++++++++++++++++++++ 4 files changed, 22 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 548fec7e7f6..3760294f079 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,21 +48,6 @@ std::unique_ptr copy_slice(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief Create a single-level empty lists column. - * - * An empty lists column contains empty children so the column's - * basic type is recorded. - * - * @param child_type The type used for the child column. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New empty lists column. 
- */ -std::unique_ptr make_empty_lists_column(data_type child_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index b179ccf228b..aec45d260bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -526,10 +526,7 @@ struct list_child_constructor { if (num_child_rows == 0) { // make an empty lists column using the input child type - return make_empty_lists_column( - source_lists_column_view.child().child(lists_column_view::child_column_index).type(), - stream, - mr); + return empty_like(source_lists_column_view.child()); } auto child_list_views = rmm::device_uvector(num_child_rows, stream, mr); diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index 3275a496cfd..ff4649f4945 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -84,19 +84,6 @@ std::unique_ptr copy_slice(lists_column_view const& lists, std::move(null_mask)); } -std::unique_ptr make_empty_lists_column(data_type child_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return cudf::make_lists_column(0, - make_empty_column(data_type{type_to_id()}), - make_empty_column(child_type), - 0, // Null count - rmm::device_buffer{0, stream, mr}, // Null mask - stream, - mr); -} - } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index bdd5e7bc780..669d406d80a 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -310,3 +310,23 @@ TEST_F(PartitionTestNotTyped, ListOfListOfIntEmpty) CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); EXPECT_EQ(3, result.second.size()); } + +TEST_F(PartitionTestNotTyped, ListOfListOfListOfIntEmpty) +{ + cudf::test::lists_column_wrapper level_3_list{}; + + fixed_width_column_wrapper level_2_offsets{}; + std::unique_ptr level_2_list = + cudf::make_lists_column(0, level_2_offsets.release(), level_3_list.release(), 0, {}); + + fixed_width_column_wrapper level_1_offsets{0, 0}; + std::unique_ptr level_1_list = + cudf::make_lists_column(1, level_1_offsets.release(), std::move(level_2_list), 0, {}); + + auto table_to_partition = cudf::table_view{{*level_1_list}}; + fixed_width_column_wrapper map{0}; + + auto result = cudf::partition(table_to_partition, map, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); + EXPECT_EQ(3, result.second.size()); +} From e555643b00d166bc43d8fbfaeccc9513dd7f15e1 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Mon, 24 May 2021 11:47:44 -0400 Subject: [PATCH 16/24] Update environment variable used to determine `cuda_version` (#8321) This PR updates the environment variable thats used to determine the `cuda_version` varaible in our conda recipes. The `CUDA` environment variable is explicitly set by the Ops team in our Jenkins jobs, whereas `CUDA_VERSION` comes from the `nvidia/cuda` Docker images that we base our images from. 
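The recipe change only affects which environment variable feeds the Jinja expression; its behaviour is roughly equivalent to this sketch (illustrative Python, not recipe code):

```python
import os

def cuda_version(env=os.environ):
    # Same logic as the updated recipe line:
    # {% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %}
    return ".".join(env.get("CUDA", "10.1").split(".")[:2])

cuda_version({"CUDA": "11.2.142"})  # '11.2' -- value set explicitly in the Jenkins jobs
cuda_version({})                    # '10.1' -- fallback when CUDA is unset
```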
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/8321 --- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- conda/recipes/custreamz/meta.yaml | 2 +- conda/recipes/dask-cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 39f2ba3188c..631ebf16aea 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 35dfb1791d8..b59a49b0db7 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 0ae0ce830ad..bb5186d7057 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: custreamz diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index e66b4c930ec..14376f54ba1 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: dask-cudf diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d42daf3194c..a8abe5b09f0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -2,7 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: libcudf From b1d7788edb41cc32965fa9d2b31347976ee4caec Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Mon, 24 May 2021 12:05:55 -0500 Subject: [PATCH 17/24] Update Java string concatenate test for single column (#8330) to stringConcatenate when using a scalar separator. Reference https://github.com/rapidsai/cudf/pull/8282 changed to throw an exception if only a single column is passed in to the stringConcatenate using scalar separator. Update our Java test for that functionality. Signed-off-by: Thomas Graves Authors: - Thomas Graves (https://github.com/tgravescs) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8330 --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 83795799a24..8da70afc6f3 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2099,15 +2099,16 @@ void testStringConcatWithNulls() { assertColumnsAreEqual(concat, e_concat); } - try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", - "g\nH", "IJ\"\u0100\u0101\u0500\u0501", - "kl m", "Nop1", "\\qRs2", null, - "3tuV\'", "wX4Yz", "\ud720\ud721"); - Scalar emptyString = Scalar.fromString(""); - Scalar nullSubstitute = Scalar.fromString("NULL"); - ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { - assertColumnsAreEqual(v, concat); - } + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", + "g\nH", "IJ\"\u0100\u0101\u0500\u0501", + "kl m", "Nop1", "\\qRs2", null, + "3tuV\'", "wX4Yz", "\ud720\ud721"); + Scalar emptyString = Scalar.fromString(""); + Scalar nullSubstitute = Scalar.fromString("NULL"); + ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { + } + }); } @Test From 5c0a75b3ab23da656762b95cc984cfff1db88323 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 24 May 2021 12:24:16 -0500 Subject: [PATCH 18/24] Fix cudf release version in readme (#8331) As part of this commit https://github.com/rapidsai/cudf/commit/84065228e0c0b5d94cdc6a44518eb9c353290c89 we accidentally changed the release version of readme to `21.06`, whereas the stable version currently in `rapidsai` channel is `0.19`. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8331 --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 587f18d2603..545e3331681 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,15 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: -For `cudf version == 21.06` : +For `cudf version == 0.19.2` : ```bash -# for CUDA 11.0 +# for CUDA 10.1 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.0 + cudf=0.19 python=3.7 cudatoolkit=10.1 -# or, for CUDA 11.2 +# or, for CUDA 10.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.2 + cudf=0.19 python=3.7 cudatoolkit=10.2 ``` From 691dd111d3c2f1bf1e2f1403be0465331f33d481 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Mon, 24 May 2021 15:10:22 -0500 Subject: [PATCH 19/24] Refactor of rolling_window implementation. (#8158) This is an attempt to significantly reduce the complexity of the logic of the SFINAE and various functors/functions inside of rolling_detail.cuh. There are 2 major components: - It introduces the idea of device "rolling operators". These operators are essentially just the implementations of what were formerly the `process_rolling_window()` functtions. However, they provide they key mechanism for removing the complex SFINAE out of the core logic. They do this by providing their own logic that can throw for invalid aggregation/type pairs at construction time, internally. - It refactors the type and aggregation-dispatched functors to use the collector/finalize paradigm used by groupby. Specifically, the rolling operation is broken down into three parts. 1.) Preprocess incoming aggregation/type pairs, potentially transforming them into different operations. 2.) Perform the rolling window operation on the transformed inputs. 3.) Postprocess the output from the rolling rolling window operation to obtain the final result. Combined, these two changes dramatically reduce the amount of dispatch and gpu rolling implementation code one has to read through. The implementation of the collect list rolling operation has been moved into `rolling_collect_list.cuh` There are a couple of other things worth mentioning: - Each device rolling operator implements an `is_supported()` constexpr function which are stripped down, type-specific versions of the old `is_rolling_supported()` global function. It might be possible to eliminate this with further fundamental type traits. Looking for opinions here. - `is_rolling_supported()` has been removed from the code, however the various tests relied on it pretty heavily. So for now I just transplanted it into the test code in a common place. It's definitely not an ideal solution, but maybe ok for now. - It might be worth moving the device rolling operators into their own module to further shrink `rolling_detail.cuh`. Also looking for opinions here. 
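To make the shape of these operators concrete, here is a heavily simplified sketch of what a device rolling operator looks like (illustrative only; the real operators in the diff below also handle default outputs, LEAD/LAG offsets, and the preprocess/postprocess steps such as computing MEAN as a SUM followed by a divide):

```cpp
#include <cudf/column/column_device_view.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/traits.hpp>

// Sketch of a SUM-style rolling operator. Each operator carries its own
// type-support check instead of relying on one global SFINAE dispatcher.
template <typename InputType>
struct device_rolling_sum_sketch {
  cudf::size_type min_periods;

  static constexpr bool is_supported()
  {
    return cudf::is_fixed_width<InputType>() && !cudf::is_timestamp<InputType>();
  }

  // Called once per output row with the window bounds [start, end).
  // Returning false marks the output row as null (too few observations).
  template <typename OutputType, bool has_nulls>
  bool __device__ operator()(cudf::column_device_view const& input,
                             cudf::mutable_column_device_view& output,
                             cudf::size_type start,
                             cudf::size_type end,
                             cudf::size_type current)
  {
    OutputType sum{};
    cudf::size_type count = 0;
    for (auto j = start; j < end; ++j) {
      if (!has_nulls || input.is_valid(j)) {
        sum += input.element<InputType>(j);
        ++count;
      }
    }
    output.element<OutputType>(current) = sum;
    return count >= min_periods;
  }
};
```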
Authors: - https://github.com/nvdbaranec Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/8158 --- .../cudf/detail/aggregation/aggregation.cuh | 18 + cpp/src/rolling/lead_lag_nested_detail.cuh | 104 +- cpp/src/rolling/rolling_collect_list.cuh | 358 ++++ cpp/src/rolling/rolling_detail.cuh | 1742 +++++++---------- cpp/src/rolling/rolling_detail.hpp | 59 - cpp/tests/rolling/grouped_rolling_test.cpp | 12 +- cpp/tests/rolling/rolling_test.cpp | 6 +- cpp/tests/rolling/rolling_test.hpp | 75 + 8 files changed, 1238 insertions(+), 1136 deletions(-) create mode 100644 cpp/src/rolling/rolling_collect_list.cuh create mode 100644 cpp/tests/rolling/rolling_test.hpp diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index d4833bcf91e..09763d66403 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -53,6 +53,14 @@ struct corresponding_operator { using type = DeviceMax; }; template <> +struct corresponding_operator { + using type = DeviceMin; +}; +template <> +struct corresponding_operator { + using type = DeviceMax; +}; +template <> struct corresponding_operator { using type = DeviceMax; }; @@ -81,6 +89,10 @@ struct corresponding_operator { using type = DeviceSum; }; template <> +struct corresponding_operator { + using type = DeviceSum; +}; +template <> struct corresponding_operator { using type = DeviceCount; }; @@ -92,6 +104,12 @@ struct corresponding_operator { template using corresponding_operator_t = typename corresponding_operator::type; +template +constexpr bool has_corresponding_operator() +{ + return !std::is_same::type, void>::value; +} + template -class lead_lag_gather_map_builder { - public: - lead_lag_gather_map_builder(size_type input_size, - size_type row_offset, - PrecedingIterator preceding, - FollowingIterator following) - : _input_size{input_size}, - _null_index{input_size}, // Out of input range. Gather returns null. - _row_offset{row_offset}, - _preceding{preceding}, - _following{following} - { - } - - template - size_type __device__ operator()(size_type i) - { - // Note: grouped_*rolling_window() trims preceding/following to - // the beginning/end of the group. `rolling_window()` does not. - // Must trim _following[i] so as not to go past the column end. - auto following = min(_following[i], _input_size - i - 1); - return (_row_offset > following) ? _null_index : (i + _row_offset); - } - - template - size_type __device__ operator()(size_type i) - { - // Note: grouped_*rolling_window() trims preceding/following to - // the beginning/end of the group. `rolling_window()` does not. - // Must trim _preceding[i] so as not to go past the column start. - auto preceding = min(_preceding[i], i + 1); - return (_row_offset > (preceding - 1)) ? _null_index : (i - _row_offset); - } - - private: - size_type const _input_size; // Number of rows in input to LEAD/LAG. - size_type const _null_index; // Index value to use to output NULL for LEAD/LAG calculation. - size_type const _row_offset; // LEAD/LAG offset. E.g. For LEAD(2), _row_offset == 2. - PrecedingIterator _preceding; // Iterator to retrieve preceding window offset. - FollowingIterator _following; // Iterator to retrieve following window offset. -}; /** * @brief Predicate to find indices at which LEAD/LAG evaluated to null. 
@@ -110,33 +61,31 @@ is_null_index_predicate_impl is_null_index_predicate(size_type in /** * @brief Helper function to calculate LEAD/LAG for nested-type input columns. * - * @tparam op The sort of aggregation being done (LEAD vs LAG) - * @tparam InputType The datatype of the input column being aggregated * @tparam PrecedingIterator Iterator-type that returns the preceding bounds * @tparam FollowingIterator Iterator-type that returns the following bounds + * @param[in] op Aggregation kind. * @param[in] input Nested-type input column for LEAD/LAG calculation * @param[in] default_outputs Default values to use as outputs, if LEAD/LAG * offset crosses column/group boundaries * @param[in] preceding Iterator to retrieve preceding window bounds * @param[in] following Iterator to retrieve following window bounds - * @param[in] offset Lead/Lag offset, indicating which row after/before - * the current row is to be returned + * @param[in] row_offset Lead/Lag offset, indicating which row after/before + * the current row is to be returned * @param[in] stream CUDA stream for device memory operations/allocations * @param[in] mr device_memory_resource for device memory allocations */ -template ())> -std::unique_ptr compute_lead_lag_for_nested(column_view const& input, +template +std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, + column_view const& input, column_view const& default_outputs, PrecedingIter preceding, FollowingIter following, - size_type offset, + size_type row_offset, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(op == aggregation::LEAD || op == aggregation::LAG, + "Unexpected aggregation type in compute_lead_lag_for_nested"); CUDF_EXPECTS(default_outputs.type().id() == input.type().id(), "Defaults column type must match input column."); // Because LEAD/LAG. @@ -145,7 +94,7 @@ std::unique_ptr compute_lead_lag_for_nested(column_view const& input, // For LEAD(0)/LAG(0), no computation need be performed. // Return copy of input. - if (offset == 0) { return std::make_unique(input, stream, mr); } + if (row_offset == 0) { return std::make_unique(input, stream, mr); } // Algorithm: // @@ -174,12 +123,33 @@ std::unique_ptr compute_lead_lag_for_nested(column_view const& input, make_numeric_column(size_data_type, input.size(), mask_state::UNALLOCATED, stream); auto gather_map = gather_map_column->mutable_view(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(size_type{input.size()}), - gather_map.begin(), - lead_lag_gather_map_builder{ - input.size(), offset, preceding, following}); + auto const input_size = input.size(); + auto const null_index = input.size(); + if (op == aggregation::LEAD) { + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(size_type{input.size()}), + gather_map.begin(), + [following, input_size, null_index, row_offset] __device__(size_type i) { + // Note: grouped_*rolling_window() trims preceding/following to + // the beginning/end of the group. `rolling_window()` does not. + // Must trim _following[i] so as not to go past the column end. + auto _following = min(following[i], input_size - i - 1); + return (row_offset > _following) ? 
null_index : (i + row_offset); + }); + } else { + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(size_type{input.size()}), + gather_map.begin(), + [preceding, input_size, null_index, row_offset] __device__(size_type i) { + // Note: grouped_*rolling_window() trims preceding/following to + // the beginning/end of the group. `rolling_window()` does not. + // Must trim _preceding[i] so as not to go past the column start. + auto _preceding = min(preceding[i], i + 1); + return (row_offset > (_preceding - 1)) ? null_index : (i - row_offset); + }); + } auto output_with_nulls = cudf::detail::gather(table_view{std::vector{input}}, diff --git a/cpp/src/rolling/rolling_collect_list.cuh b/cpp/src/rolling/rolling_collect_list.cuh new file mode 100644 index 00000000000..f5a2e59fd2a --- /dev/null +++ b/cpp/src/rolling/rolling_collect_list.cuh @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +namespace { +/** + * @brief Creates the offsets child of the result of the `COLLECT_LIST` window aggregation + * + * Given the input column, the preceding/following window bounds, and `min_periods`, + * the sizes of each list row may be computed. These values can then be used to + * calculate the offsets for the result of `COLLECT_LIST`. + * + * Note: If `min_periods` exceeds the number of observations for a window, the size + * is set to `0` (since the result is `null`). + */ +template +std::unique_ptr create_collect_offsets(size_type input_size, + PrecedingIter preceding_begin, + FollowingIter following_begin, + size_type min_periods, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Materialize offsets column. + auto static constexpr size_data_type = data_type{type_to_id()}; + auto sizes = + make_fixed_width_column(size_data_type, input_size, mask_state::UNALLOCATED, stream, mr); + auto mutable_sizes = sizes->mutable_view(); + + // Consider the following preceding/following values: + // preceding = [1,2,2,2,2] + // following = [1,1,1,1,0] + // The sum of the vectors should yield the window sizes: + // prec + foll = [2,3,3,3,2] + // + // If min_periods=2, all rows have at least `min_periods` observations. + // But if min_periods=3, rows at indices 0 and 4 have too few observations, and must return + // null. The sizes at these positions must be 0, i.e. + // prec + foll = [0,3,3,3,0] + thrust::transform(rmm::exec_policy(stream), + preceding_begin, + preceding_begin + input_size, + following_begin, + mutable_sizes.begin(), + [min_periods] __device__(auto const preceding, auto const following) { + return (preceding + following) < min_periods ? 
0 : (preceding + following); + }); + + // Convert `sizes` to an offsets column, via inclusive_scan(): + return strings::detail::make_offsets_child_column( + sizes->view().begin(), sizes->view().end(), stream, mr); +} + +/** + * @brief Generate mapping of each row in the COLLECT_LIST result's child column + * to the index of the row it belongs to. + * + * If + * input col == [A,B,C,D,E] + * and preceding == [1,2,2,2,2], + * and following == [1,1,1,1,0], + * then, + * collect result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] + * i.e. result offset column == [0,2,5,8,11,13], + * and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. + * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] + */ +std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto static constexpr size_data_type = data_type{type_to_id()}; + + // First, reduce offsets column by key, to identify the number of times + // an offset appears. + // Next, scatter the count for each offset (except the first and last), + // into a column of N `0`s, where N == number of child rows. + // For the example above: + // offsets == [0, 2, 5, 8, 11, 13] + // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] + // + // If the above example had an empty list row at index 2, + // the same columns would look as follows: + // offsets == [0, 2, 5, 5, 8, 11, 13] + // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] + // + // Note: To correctly handle null list rows at the beginning of + // the output column, care must be taken to skip the first `0` + // in the offsets column, when running `reduce_by_key()`. + // This accounts for the `0` added by default to the offsets + // column, marking the beginning of the column. + + auto const num_child_rows{ + cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; + + auto scatter_values = + make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); + auto scatter_keys = + make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); + auto reduced_by_key = + thrust::reduce_by_key(rmm::exec_policy(stream), + offsets.template begin() + 1, // Skip first 0 in offsets. + offsets.template end(), + thrust::make_constant_iterator(1), + scatter_keys->mutable_view().template begin(), + scatter_values->mutable_view().template begin()); + auto scatter_values_end = reduced_by_key.second; + auto scatter_output = + make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); + thrust::fill_n(rmm::exec_policy(stream), + scatter_output->mutable_view().template begin(), + num_child_rows, + 0); // [0,0,0,...0] + thrust::scatter(rmm::exec_policy(stream), + scatter_values->mutable_view().template begin(), + scatter_values_end, + scatter_keys->view().template begin(), + scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] + + // Next, generate mapping with inclusive_scan() on scatter() result. 
+ // For the example above: + // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] + // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] + // + // For the case with an empty list at index 3: + // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] + // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] + auto per_row_mapping = + make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); + thrust::inclusive_scan(rmm::exec_policy(stream), + scatter_output->view().template begin(), + scatter_output->view().template end(), + per_row_mapping->mutable_view().template begin()); + return per_row_mapping; +} + +/** + * @brief Create gather map to generate the child column of the result of + * the `COLLECT_LIST` window aggregation. + */ +template +std::unique_ptr create_collect_gather_map(column_view const& child_offsets, + column_view const& per_row_mapping, + PrecedingIter preceding_iter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto gather_map = make_fixed_width_column(data_type{type_to_id()}, + per_row_mapping.size(), + mask_state::UNALLOCATED, + stream, + mr); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(per_row_mapping.size()), + gather_map->mutable_view().template begin(), + [d_offsets = + child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] + d_groups = + per_row_mapping.template begin(), // E.g. [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] + d_prev = preceding_iter] __device__(auto i) { + auto group = d_groups[i]; + auto group_start_offset = d_offsets[group]; + auto relative_index = i - group_start_offset; + + return (group - d_prev[group] + 1) + relative_index; + }); + return gather_map; +} + +/** + * @brief Count null entries in result of COLLECT_LIST. + */ +size_type count_child_nulls(column_view const& input, + std::unique_ptr const& gather_map, + rmm::cuda_stream_view stream) +{ + auto input_device_view = column_device_view::create(input, stream); + + auto input_row_is_null = [d_input = *input_device_view] __device__(auto i) { + return d_input.is_null_nocheck(i); + }; + + return thrust::count_if(rmm::exec_policy(stream), + gather_map->view().template begin(), + gather_map->view().template end(), + input_row_is_null); +} + +/** + * @brief Purge entries for null inputs from gather_map, and adjust offsets. + */ +std::pair, std::unique_ptr> purge_null_entries( + column_view const& input, + column_view const& gather_map, + column_view const& offsets, + size_type num_child_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto input_device_view = column_device_view::create(input, stream); + + auto input_row_not_null = [d_input = *input_device_view] __device__(auto i) { + return d_input.is_valid_nocheck(i); + }; + + // Purge entries in gather_map that correspond to null input. + auto new_gather_map = make_fixed_width_column(data_type{type_to_id()}, + gather_map.size() - num_child_nulls, + mask_state::UNALLOCATED, + stream, + mr); + thrust::copy_if(rmm::exec_policy(stream), + gather_map.template begin(), + gather_map.template end(), + new_gather_map->mutable_view().template begin(), + input_row_not_null); + + // Recalculate offsets after null entries are purged. 
+ auto new_sizes = make_fixed_width_column( + data_type{type_to_id()}, input.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + new_sizes->mutable_view().template begin(), + [d_gather_map = gather_map.template begin(), + d_old_offsets = offsets.template begin(), + input_row_not_null] __device__(auto i) { + return thrust::count_if(thrust::seq, + d_gather_map + d_old_offsets[i], + d_gather_map + d_old_offsets[i + 1], + input_row_not_null); + }); + + auto new_offsets = + strings::detail::make_offsets_child_column(new_sizes->view().template begin(), + new_sizes->view().template end(), + stream, + mr); + + return std::make_pair, std::unique_ptr>(std::move(new_gather_map), + std::move(new_offsets)); +} + +} // anonymous namespace + +template +std::unique_ptr rolling_collect_list(column_view const& input, + column_view const& default_outputs, + PrecedingIter preceding_begin_raw, + FollowingIter following_begin_raw, + size_type min_periods, + rolling_aggregation const& agg, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(default_outputs.is_empty(), + "COLLECT_LIST window function does not support default values."); + + if (input.is_empty()) return empty_like(input); + + // Fix up preceding/following iterators to respect column boundaries, + // similar to gpu_rolling(). + // `rolling_window()` does not fix up preceding/following so as not to read past + // column boundaries. + // `grouped_rolling_window()` and `time_range_based_grouped_rolling_window() do. + auto preceding_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [preceding_begin_raw] __device__(auto i) { + return thrust::min(preceding_begin_raw[i], i + 1); + }); + auto following_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [following_begin_raw, size = input.size()] __device__(auto i) { + return thrust::min(following_begin_raw[i], size - i - 1); + }); + + // Materialize collect list's offsets. + auto offsets = + create_collect_offsets(input.size(), preceding_begin, following_begin, min_periods, stream, mr); + + // Map each element of the collect() result's child column + // to the index where it appears in the input. + auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr); + + // Generate gather map to produce the collect() result's child column. + auto gather_map = create_collect_gather_map( + offsets->view(), per_row_mapping->view(), preceding_begin, stream, mr); + + // If gather_map collects null elements, and null_policy == EXCLUDE, + // those elements must be filtered out, and offsets recomputed. + auto null_handling = dynamic_cast(agg)._null_handling; + if (null_handling == null_policy::EXCLUDE && input.has_nulls()) { + auto num_child_nulls = count_child_nulls(input, gather_map, stream); + if (num_child_nulls != 0) { + std::tie(gather_map, offsets) = + purge_null_entries(input, *gather_map, *offsets, num_child_nulls, stream, mr); + } + } + + // gather(), to construct child column. 
+ auto gather_output = + cudf::gather(table_view{std::vector{input}}, gather_map->view()); + + rmm::device_buffer null_mask; + size_type null_count; + std::tie(null_mask, null_count) = valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + [preceding_begin, following_begin, min_periods] __device__(auto i) { + return (preceding_begin[i] + following_begin[i]) >= min_periods; + }, + stream, + mr); + + return make_lists_column(input.size(), + std::move(offsets), + std::move(gather_output->release()[0]), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 1192b9cad87..9e6d135b153 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -17,6 +17,9 @@ #pragma once #include "lead_lag_nested_detail.cuh" +#include "rolling/rolling_collect_list.cuh" +#include "rolling/rolling_detail.hpp" +#include "rolling/rolling_jit_detail.hpp" #include "rolling_detail.hpp" #include @@ -28,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -64,375 +66,695 @@ namespace cudf { namespace detail { + namespace { // anonymous + /** - * @brief Only COUNT_VALID operation is executed and count is updated - * depending on `min_periods` and returns true if it was - * valid, else false. + * @brief Operator for applying a generic (non-specialized) rolling aggregation on a single window. */ -template * = nullptr> -bool __device__ process_rolling_window(column_device_view input, - column_device_view ignored_default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods) -{ - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; +template +struct DeviceRolling { + size_type min_periods; - bool output_is_valid = ((end_index - start_index) >= min_periods); + // what operations do we support + template + static constexpr bool is_supported() + { + return cudf::detail::is_valid_aggregation() && has_corresponding_operator() && + // TODO: Delete all this extra logic once is_valid_aggregation<> cleans up some edge + // cases it isn't handling. 
+ // MIN/MAX supports all fixed width types + (((O == aggregation::MIN || O == aggregation::MAX) && cudf::is_fixed_width()) || - if (output_is_valid) { - if (!has_nulls) { - count = end_index - start_index; - } else { - count = thrust::count_if(thrust::seq, - thrust::make_counting_iterator(start_index), - thrust::make_counting_iterator(end_index), - [&input](auto i) { return input.is_valid_nocheck(i); }); + // SUM supports all fixed width types except timestamps + ((O == aggregation::SUM) && (cudf::is_fixed_width() && !cudf::is_timestamp())) || + + // MEAN supports numeric and duration + ((O == aggregation::MEAN) && (cudf::is_numeric() || cudf::is_duration()))); + } + + // operations we do support + template + DeviceRolling(size_type _min_periods, typename std::enable_if_t()>* = nullptr) + : min_periods(_min_periods) + { + } + + // operations we don't support + template + DeviceRolling(size_type _min_periods, typename std::enable_if_t()>* = nullptr) + : min_periods(_min_periods) + { + CUDF_FAIL("Invalid aggregation/type pair"); + } + + // perform the windowing operation + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& ignored_default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + using AggOp = typename corresponding_operator::type; + AggOp agg_op; + + // declare this as volatile to avoid some compiler optimizations that lead to incorrect results + // for CUDA 10.0 and below (fixed in CUDA 10.1) + volatile cudf::size_type count = 0; + OutputType val = AggOp::template identity(); + + for (size_type j = start_index; j < end_index; j++) { + if (!has_nulls || input.is_valid(j)) { + OutputType element = input.element>(j); + val = agg_op(element, val); + count++; + } + } + + bool output_is_valid = (count >= min_periods); + + // store the output value, one per thread + cudf::detail::rolling_store_output_functor{}( + output.element(current_index), val, count); + + return output_is_valid; + } +}; + +/** + * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window. + */ +template +struct DeviceRollingArgMinMax { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + // strictly speaking, I think it would be ok to make this work + // for comparable types as well. but right now the only use case is + // for MIN/MAX on strings. + return std::is_same::value; + } + + DeviceRollingArgMinMax(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& ignored_default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + using AggOp = typename corresponding_operator::type; + AggOp agg_op; + + // declare this as volatile to avoid some compiler optimizations that lead to incorrect results + // for CUDA 10.0 and below (fixed in CUDA 10.1) + volatile cudf::size_type count = 0; + InputType val = AggOp::template identity(); + OutputType val_index = (op == aggregation::ARGMIN) ? 
ARGMIN_SENTINEL : ARGMAX_SENTINEL; + + for (size_type j = start_index; j < end_index; j++) { + if (!has_nulls || input.is_valid(j)) { + InputType element = input.element(j); + val = agg_op(element, val); + if (val == element) { val_index = j; } + count++; + } + } + + bool output_is_valid = (count >= min_periods); + // -1 will help identify null elements while gathering for Min and Max + // In case of count, this would be null, so doesn't matter. + output.element(current_index) = (output_is_valid) ? val_index : -1; + + // The gather mask shouldn't contain null values, so + // always return zero + return true; + } +}; + +/** + * @brief Operator for applying a COUNT_VALID rolling aggregation on a single window. + */ +template +struct DeviceRollingCountValid { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return true; + } + + DeviceRollingCountValid(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& ignored_default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + // declare this as volatile to avoid some compiler optimizations that lead to incorrect results + // for CUDA 10.0 and below (fixed in CUDA 10.1) + volatile cudf::size_type count = 0; + + bool output_is_valid = ((end_index - start_index) >= min_periods); + + if (output_is_valid) { + if (!has_nulls) { + count = end_index - start_index; + } else { + count = thrust::count_if(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + [&input](auto i) { return input.is_valid_nocheck(i); }); + } + output.element(current_index) = count; } + + return output_is_valid; + } +}; + +/** + * @brief Operator for applying a COUNT_ALL rolling aggregation on a single window. + */ +template +struct DeviceRollingCountAll { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return true; + } + + DeviceRollingCountAll(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& ignored_default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + cudf::size_type count = end_index - start_index; + + bool output_is_valid = count >= min_periods; output.element(current_index) = count; + + return output_is_valid; } +}; - return output_is_valid; -} +/** + * @brief Operator for applying a ROW_NUMBER rolling aggregation on a single window. 
+ */ +template +struct DeviceRollingRowNumber { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return true; + } + + DeviceRollingRowNumber(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& ignored_default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + bool output_is_valid = end_index - start_index >= min_periods; + output.element(current_index) = current_index - start_index + 1; + + return output_is_valid; + } +}; /** - * @brief Only COUNT_ALL operation is executed and count is updated - * depending on `min_periods` and returns true if it was - * valid, else false. + * @brief Operator for applying a LEAD rolling aggregation on a single window. */ -template * = nullptr> -bool __device__ process_rolling_window(column_device_view input, - column_device_view ignored_default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods) -{ - cudf::size_type count = end_index - start_index; +template +struct DeviceRollingLead { + size_type row_offset; - bool output_is_valid = count >= min_periods; - output.element(current_index) = count; + // what operations do we support + template + static constexpr bool is_supported() + { + return cudf::is_fixed_width(); + } - return output_is_valid; -} + template ()>* = nullptr> + DeviceRollingLead(size_type _row_offset) : row_offset(_row_offset) + { + } + + template ()>* = nullptr> + DeviceRollingLead(size_type _row_offset) : row_offset(_row_offset) + { + CUDF_FAIL("Invalid aggregation/type pair"); + } + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + // Offsets have already been normalized. + + // Check if row is invalid. + if (row_offset > (end_index - current_index - 1)) { + // Invalid row marked. Use default value, if available. + if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } + + output.element(current_index) = + default_outputs.element(current_index); + return true; + } + + // Not an invalid row. + auto index = current_index + row_offset; + auto is_null = input.is_null(index); + if (!is_null) { + output.element(current_index) = + input.element>(index); + } + return !is_null; + } +}; /** - * @brief Calculates row-number of current index within [start_index, end_index). Count is updated - * depending on `min_periods`. Returns `true` if it was valid, else `false`. + * @brief Operator for applying a LAG rolling aggregation on a single window. 
*/ -template * = nullptr> -bool __device__ process_rolling_window(column_device_view input, - column_device_view ignored_default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods) -{ - bool output_is_valid = end_index - start_index >= min_periods; - output.element(current_index) = current_index - start_index + 1; +template +struct DeviceRollingLag { + size_type row_offset; - return output_is_valid; -} + // what operations do we support + template + static constexpr bool is_supported() + { + return cudf::is_fixed_width(); + } + + template ()>* = nullptr> + DeviceRollingLag(size_type _row_offset) : row_offset(_row_offset) + { + } + + template ()>* = nullptr> + DeviceRollingLag(size_type _row_offset) : row_offset(_row_offset) + { + CUDF_FAIL("Invalid aggregation/type pair"); + } + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + // Offsets have already been normalized. + + // Check if row is invalid. + if (row_offset > (current_index - start_index)) { + // Invalid row marked. Use default value, if available. + if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } + + output.element(current_index) = + default_outputs.element(current_index); + return true; + } + + // Not an invalid row. + auto index = current_index - row_offset; + auto is_null = input.is_null(index); + if (!is_null) { + output.element(current_index) = + input.element>(index); + } + return !is_null; + } +}; /** - * @brief LEAD(N): Returns the row from the input column, at the specified offset past the - * current row. - * If the offset crosses the grouping boundary or column boundary for - * a given row, a "default" value is returned. The "default" value is null, by default. + * @brief Maps an `InputType and `aggregation::Kind` value to it's corresponding + * rolling window operator. * - * E.g. Consider an input column with the following values and grouping: - * [10, 11, 12, 13, 20, 21, 22, 23] - * <------G1-----> <------G2------> - * - * LEAD(input_col, 1) yields: - * [11, 12, 13, null, 21, 22, 23, null] - * - * LEAD(input_col, 1, 99) (where 99 indicates the default) yields: - * [11, 12, 13, 99, 21, 22, 23, 99] + * @tparam InputType The input type to map to its corresponding operator + * @tparam k The `aggregation::Kind` value to map to its corresponding operator */ -template -std::enable_if_t<(op == aggregation::LEAD) && (cudf::is_fixed_width()), bool> __device__ -process_rolling_window(column_device_view input, - column_device_view default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods, - agg_op device_agg_op) -{ - // Offsets have already been normalized. - auto row_offset = device_agg_op.row_offset; +template +struct corresponding_rolling_operator { + using type = DeviceRolling; +}; - // Check if row is invalid. - if (row_offset > (end_index - current_index - 1)) { - // Invalid row marked. Use default value, if available. 
- if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } +template +struct corresponding_rolling_operator { + using type = DeviceRollingArgMinMax; +}; - output.element(current_index) = default_outputs.element(current_index); - return true; +template +struct corresponding_rolling_operator { + using type = DeviceRollingArgMinMax; +}; + +template +struct corresponding_rolling_operator { + using type = DeviceRollingCountValid; +}; + +template +struct corresponding_rolling_operator { + using type = DeviceRollingCountAll; +}; + +template +struct corresponding_rolling_operator { + using type = DeviceRollingRowNumber; +}; + +template +struct corresponding_rolling_operator { + using type = DeviceRollingLead; +}; + +template +struct corresponding_rolling_operator { + using type = DeviceRollingLag; +}; + +/** + * @brief Functor for creating a device rolling operator based on input type and aggregation type. + */ +template +struct create_rolling_operator { + auto operator()(size_type min_periods, rolling_aggregation const& agg) + { + CUDF_FAIL("Invalid aggregation/type pair"); } +}; - // Not an invalid row. - auto index = current_index + row_offset; - auto is_null = input.is_null(index); - if (!is_null) { output.element(current_index) = input.element(index); } - return !is_null; -} +template +struct create_rolling_operator< + InputType, + op, + std::enable_if_t::type::is_supported()>> { + template < + typename T = InputType, + aggregation::Kind O = op, + std::enable_if_t* = nullptr> + auto operator()(size_type min_periods, rolling_aggregation const& agg) + { + return typename corresponding_rolling_operator::type(min_periods); + } + + template * = nullptr> + auto operator()(size_type min_periods, rolling_aggregation const& agg) + { + return DeviceRollingLead{ + dynamic_cast(agg).row_offset}; + } + + template * = nullptr> + auto operator()(size_type min_periods, rolling_aggregation const& agg) + { + return DeviceRollingLag{ + dynamic_cast(agg).row_offset}; + } +}; /** - * @brief LAG(N): returns the row from the input column at the specified offset preceding - * the current row. - * If the offset crosses the grouping boundary or column boundary for - * a given row, a "default" value is returned. The "default" value is null, by default. + * @brief Rolling window specific implementation of simple_aggregations_collector. + * + * The purpose of this class is to preprocess incoming aggregation/type pairs and + * potentially transform them into other aggregation/type pairs. Typically when this + * happens, the equivalent aggregation/type implementation of finalize() will perform + * some postprocessing step. + * + * An example of this would be applying a MIN aggregation to strings. This cannot be done + * directly in the rolling operation, so instead the following happens: * - * E.g. Consider an input column with the following values and grouping: - * [10, 11, 12, 13, 20, 21, 22, 23] - * <------G1-----> <------G2------> + * - the rolling_aggregation_preprocessor transforms the incoming MIN/string pair to + * an ARGMIN/int pair. + * - The ARGMIN/int has the rolling operation applied to it, generating a list of indices + * that can then be used as a gather map. + * - The rolling_aggregation_postprocessor then takes this gather map and performs a final + * gather() on the input string data to generate the final output. + * + * Another example is COLLECT_LIST. COLLECT_LIST is odd in that it doesn't go through the + * normal gpu rolling kernel at all. 
It has a completely custom implementation. So the + * following happens: + * + * - the rolling_aggregation_preprocessor transforms the COLLECT_LIST aggregation into nothing, + * since no actual rolling window operation will be performed. + * - the rolling_aggregation_postprocessor calls the specialized rolling_collect_list() + * function to generate the final output. * - * LAG(input_col, 2) yields: - * [null, null, 10, 11, null, null, 20, 21] - * LAG(input_col, 2, 99) yields: - * [99, 99, 10, 11, 99, 99, 20, 21] */ -template -std::enable_if_t<(op == aggregation::LAG) && (cudf::is_fixed_width()), bool> __device__ -process_rolling_window(column_device_view input, - column_device_view default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods, - agg_op device_agg_op) -{ - // Offsets have already been normalized. - auto row_offset = device_agg_op.row_offset; +class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + // NOTE : all other aggregations are passed through unchanged via the default + // visit() function in the simple_aggregations_collector. + + // MIN aggregations with strings are processed in 2 passes. The first pass performs + // the rolling operation on a ARGMIN aggregation to generate indices instead of values. + // Then a second pass uses those indices to gather the final strings. This step + // translates the the MIN -> ARGMIN aggregation + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const& agg) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } - // Check if row is invalid. - if (row_offset > (current_index - start_index)) { - // Invalid row marked. Use default value, if available. - if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } + // MAX aggregations with strings are processed in 2 passes. The first pass performs + // the rolling operation on a ARGMAX aggregation to generate indices instead of values. + // Then a second pass uses those indices to gather the final strings. This step + // translates the the MAX -> ARGMAX aggregation + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const& agg) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } - output.element(current_index) = default_outputs.element(current_index); - return true; + // COLLECT_LIST aggregations do not peform a rolling operation at all. They get processed + // entirely in the finalize() step. + std::vector> visit( + data_type col_type, cudf::detail::collect_list_aggregation const& agg) override + { + return {}; } - // Not an invalid row. - auto index = current_index - row_offset; - auto is_null = input.is_null(index); - if (!is_null) { output.element(current_index) = input.element(index); } - return !is_null; -} + // LEAD and LAG have custom behaviors for non fixed-width types. + std::vector> visit( + data_type col_type, cudf::detail::lead_lag_aggregation const& agg) override + { + // no rolling operation for non-fixed width. 
just a postprocess step at the end + if (!cudf::is_fixed_width(col_type)) { return {}; } + // otherwise, pass through + std::vector> aggs; + aggs.push_back(agg.clone()); + return aggs; + } +}; /** - * @brief Only used for `string_view` type to get ARGMIN and ARGMAX, which - * will be used to gather MIN and MAX. And returns true if the - * operation was valid, else false. + * @brief Rolling window specific implementation of aggregation_finalizer. + * + * The purpose of this class is to postprocess rolling window data depending on the + * aggregation/type pair. See the description of rolling_aggregation_preprocessor for + * a detailed description. + * */ -template ::value>* = nullptr> -bool __device__ process_rolling_window(column_device_view input, - column_device_view ignored_default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods) -{ - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; - InputType val = agg_op::template identity(); - OutputType val_index = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; - - for (size_type j = start_index; j < end_index; j++) { - if (!has_nulls || input.is_valid(j)) { - InputType element = input.element(j); - val = agg_op{}(element, val); - if (val == element) { val_index = j; } - count++; - } +template +class rolling_aggregation_postprocessor final : public cudf::detail::aggregation_finalizer { + public: + using cudf::detail::aggregation_finalizer::visit; + + rolling_aggregation_postprocessor(column_view const& _input, + column_view const& _default_outputs, + data_type _result_type, + PrecedingWindowIterator _preceding_window_begin, + FollowingWindowIterator _following_window_begin, + int _min_periods, + std::unique_ptr&& _intermediate, + rmm::cuda_stream_view _stream, + rmm::mr::device_memory_resource* _mr) + : + + input(_input), + default_outputs(_default_outputs), + result_type(_result_type), + preceding_window_begin(_preceding_window_begin), + following_window_begin(_following_window_begin), + min_periods(_min_periods), + intermediate(std::move(_intermediate)), + result(nullptr), + stream(_stream), + mr(_mr) + { } - bool output_is_valid = (count >= min_periods); - // -1 will help identify null elements while gathering for Min and Max - // In case of count, this would be null, so doesn't matter. - output.element(current_index) = (output_is_valid) ? val_index : -1; + // all non-specialized aggregation types simply pass the intermediate result through. + void visit(aggregation const& agg) override { result = std::move(intermediate); } - // The gather mask shouldn't contain null values, so - // always return zero - return true; -} + // perform a final gather on the generated ARGMIN data + void visit(cudf::detail::min_aggregation const& agg) override + { + if (result_type.id() == type_id::STRING) { + // The rows that represent null elements will have negative values in gather map, + // and that's why nullify_out_of_bounds/ignore_out_of_bounds is true. 
+ auto output_table = detail::gather(table_view{{input}}, + intermediate->view(), + cudf::out_of_bounds_policy::NULLIFY, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + result = std::make_unique(std::move(output_table->get_column(0))); + } else { + result = std::move(intermediate); + } + } -/** - * @brief Operates on only fixed-width types and returns true if the - * operation was valid, else false. - */ -template ::value and - !(op == aggregation::COUNT_VALID || op == aggregation::COUNT_ALL || - op == aggregation::ROW_NUMBER || op == aggregation::LEAD || - op == aggregation::LAG || op == aggregation::COLLECT_LIST)>* = nullptr> -bool __device__ process_rolling_window(column_device_view input, - column_device_view ignored_default_outputs, - mutable_column_device_view output, - size_type start_index, - size_type end_index, - size_type current_index, - size_type min_periods) -{ - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; - OutputType val = agg_op::template identity(); - - for (size_type j = start_index; j < end_index; j++) { - if (!has_nulls || input.is_valid(j)) { - OutputType element = input.element(j); - val = agg_op{}(element, val); - count++; + // perform a final gather on the generated ARGMAX data + void visit(cudf::detail::max_aggregation const& agg) override + { + if (result_type.id() == type_id::STRING) { + // The rows that represent null elements will have negative values in gather map, + // and that's why nullify_out_of_bounds/ignore_out_of_bounds is true. + auto output_table = detail::gather(table_view{{input}}, + intermediate->view(), + cudf::out_of_bounds_policy::NULLIFY, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + result = std::make_unique(std::move(output_table->get_column(0))); + } else { + result = std::move(intermediate); } } - bool output_is_valid = (count >= min_periods); + // perform the actual COLLECT_LIST operation entirely. + void visit(cudf::detail::collect_list_aggregation const& agg) override + { + result = rolling_collect_list(input, + default_outputs, + preceding_window_begin, + following_window_begin, + min_periods, + agg, + stream, + mr); + } - // store the output value, one per thread - cudf::detail::rolling_store_output_functor{}( - output.element(current_index), val, count); + std::unique_ptr get_result() + { + CUDF_EXPECTS(result != nullptr, + "Calling result on rolling aggregation postprocessor that has not been visited in " + "rolling_window"); + return std::move(result); + } - return output_is_valid; -} + // LEAD and LAG have custom behaviors for non fixed-width types. 
+ void visit(cudf::detail::lead_lag_aggregation const& agg) override + { + // if this is non-fixed width, run the custom lead-lag code + if (!cudf::is_fixed_width(result_type)) { + result = + cudf::detail::compute_lead_lag_for_nested( + agg.kind, + input, + default_outputs, + preceding_window_begin, + following_window_begin, + agg.row_offset, + stream, + mr); + } + // otherwise just pass through the intermediate + else { + result = std::move(intermediate); + } + } + + private: + column_view input; + column_view default_outputs; + data_type result_type; + PrecedingWindowIterator preceding_window_begin; + FollowingWindowIterator following_window_begin; + int min_periods; + std::unique_ptr intermediate; + std::unique_ptr result; + rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; +}; /** * @brief Computes the rolling window function * * @tparam InputType Datatype of `input` * @tparam OutputType Datatype of `output` - * @tparam agg_op A functor that defines the aggregation operation * @tparam op The aggregation operator (enum value) * @tparam block_size CUDA block size for the kernel * @tparam has_nulls true if the input column has nulls + * @tparam DeviceRollingOperator An operator that performs a single windowing operation * @tparam PrecedingWindowIterator iterator type (inferred) * @tparam FollowingWindowIterator iterator type (inferred) * @param input Input column device view + * @param default_outputs A column of per-row default values to be returned instead + * of nulls for certain aggregation types. * @param output Output column device view + * @param output_valid_count Output count of valid values + * @param device_operator The operator used to perform a single window operation * @param preceding_window_begin[in] Rolling window size iterator, accumulates from * in_col[i-preceding_window] to in_col[i] inclusive * @param following_window_begin[in] Rolling window size iterator in the forward * direction, accumulates from in_col[i] to * in_col[i+following_window] inclusive - * @param min_periods[in] Minimum number of observations in window required to - * have a value, otherwise 0 is stored in the valid bit mask */ template -__launch_bounds__(block_size) __global__ - void gpu_rolling(column_device_view input, - column_device_view default_outputs, - mutable_column_device_view output, - size_type* __restrict__ output_valid_count, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods) -{ - size_type i = blockIdx.x * block_size + threadIdx.x; - size_type stride = block_size * gridDim.x; - - size_type warp_valid_count{0}; - - auto active_threads = __ballot_sync(0xffffffff, i < input.size()); - while (i < input.size()) { - size_type preceding_window = preceding_window_begin[i]; - size_type following_window = following_window_begin[i]; - - // compute bounds - size_type start = min(input.size(), max(0, i - preceding_window + 1)); - size_type end = min(input.size(), max(0, i + following_window + 1)); - size_type start_index = min(start, end); - size_type end_index = max(start, end); - - // aggregate - // TODO: We should explore using shared memory to avoid redundant loads. - // This might require separating the kernel into a special version - // for dynamic and static sizes. 
- - volatile bool output_is_valid = false; - output_is_valid = process_rolling_window( - input, default_outputs, output, start_index, end_index, i, min_periods); - - // set the mask - cudf::bitmask_type result_mask{__ballot_sync(active_threads, output_is_valid)}; - - // only one thread writes the mask - if (0 == threadIdx.x % cudf::detail::warp_size) { - output.set_mask_word(cudf::word_index(i), result_mask); - warp_valid_count += __popc(result_mask); - } - - // process next element - i += stride; - active_threads = __ballot_sync(active_threads, i < input.size()); - } - - // sum the valid counts across the whole block - size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - - if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } -} - -template __launch_bounds__(block_size) __global__ @@ -440,10 +762,9 @@ __launch_bounds__(block_size) __global__ column_device_view default_outputs, mutable_column_device_view output, size_type* __restrict__ output_valid_count, + DeviceRollingOperator device_operator, PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - agg_op device_agg_op) + FollowingWindowIterator following_window_begin) { size_type i = blockIdx.x * block_size + threadIdx.x; size_type stride = block_size * gridDim.x; @@ -467,8 +788,8 @@ __launch_bounds__(block_size) __global__ // for dynamic and static sizes. volatile bool output_is_valid = false; - output_is_valid = process_rolling_window( - input, default_outputs, output, start_index, end_index, i, min_periods, device_agg_op); + output_is_valid = device_operator.template operator()( + input, default_outputs, output, start_index, end_index, i); // set the mask cudf::bitmask_type result_mask{__ballot_sync(active_threads, output_is_valid)}; @@ -491,726 +812,108 @@ __launch_bounds__(block_size) __global__ if (threadIdx.x == 0) { atomicAdd(output_valid_count, block_valid_count); } } +/** + * @brief Type/aggregation dispatched functor for launching the gpu rolling window + * kernel. 
+ */ template struct rolling_window_launcher { - template - size_type kernel_launcher(column_view const& input, - column_view const& default_outputs, - mutable_column_view& output, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream) - { - using Type = device_storage_type_t; - using OutType = device_storage_type_t>; - - constexpr cudf::size_type block_size = 256; - cudf::detail::grid_1d grid(input.size(), block_size); - - auto input_device_view = column_device_view::create(input, stream); - auto output_device_view = mutable_column_device_view::create(output, stream); - auto default_outputs_device_view = column_device_view::create(default_outputs, stream); - - rmm::device_scalar device_valid_count{0, stream}; - - if (input.has_nulls()) { - gpu_rolling - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods); - } else { - gpu_rolling - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods); - } - - size_type valid_count = device_valid_count.value(stream); - - // check the stream for debugging - CHECK_CUDA(stream.value()); - - return valid_count; - } - - template - size_type kernel_launcher(column_view const& input, - column_view const& default_outputs, - mutable_column_view& output, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - rolling_aggregation const& agg, - agg_op const& device_agg_op, - rmm::cuda_stream_view stream) - { - using Type = device_storage_type_t; - using OutType = device_storage_type_t>; - - constexpr cudf::size_type block_size = 256; - cudf::detail::grid_1d grid(input.size(), block_size); - - auto input_device_view = column_device_view::create(input, stream); - auto output_device_view = mutable_column_device_view::create(output, stream); - auto default_outputs_device_view = column_device_view::create(default_outputs, stream); - - rmm::device_scalar device_valid_count{0, stream}; - - if (input.has_nulls()) { - gpu_rolling - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods, - device_agg_op); - } else { - gpu_rolling - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods, - device_agg_op); - } - - size_type valid_count = device_valid_count.value(stream); - - // check the stream for debugging - CHECK_CUDA(stream.value()); - - return valid_count; - } - - // This launch is only for fixed width columns with valid aggregation option - // numeric: All - // timestamp: MIN, MAX, COUNT_VALID, COUNT_ALL, ROW_NUMBER - // string, dictionary, list : COUNT_VALID, COUNT_ALL, ROW_NUMBER - template - std::enable_if_t() and - !cudf::detail::is_rolling_string_specialization(), - std::unique_ptr> - launch(column_view const& input, - column_view const& default_outputs, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - 
auto output = make_fixed_width_column( - target_type(input.type(), op), input.size(), mask_state::UNINITIALIZED, stream, mr); - - cudf::mutable_column_view output_view = output->mutable_view(); - auto valid_count = - kernel_launcher( - input, - default_outputs, - output_view, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - stream); - - output->set_null_count(output->size() - valid_count); - - return output; - } - - // This launch is only for string specializations - // string: MIN, MAX - template - std::enable_if_t(), - std::unique_ptr> - launch(column_view const& input, - column_view const& default_outputs, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto output = make_numeric_column(cudf::data_type{cudf::type_to_id()}, - input.size(), - cudf::mask_state::UNINITIALIZED, - stream, - mr); - - cudf::mutable_column_view output_view = output->mutable_view(); - - // Passing the agg_op and aggregation::Kind as constant to group them in pair, else it - // evolves to error when try to use agg_op as compiler tries different combinations - if (op == aggregation::MIN) { - kernel_launcher(input, - default_outputs, - output_view, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - stream); - } else if (op == aggregation::MAX) { - kernel_launcher(input, - default_outputs, - output_view, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - stream); - } else { - CUDF_FAIL("MIN and MAX are the only supported aggregation types for string columns"); - } - - // The rows that represent null elements will be having negative values in gather map, - // and that's why nullify_out_of_bounds/ignore_out_of_bounds is true. 
- auto output_table = detail::gather(table_view{{input}}, - output->view(), - cudf::out_of_bounds_policy::NULLIFY, - detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::make_unique(std::move(output_table->get_column(0))); - } - - // Deals with invalid column and/or aggregation options - template - std::enable_if_t() and - !cudf::detail::is_rolling_string_specialization(), - std::unique_ptr> - launch(column_view const& input, - column_view const& default_outputs, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - CUDF_FAIL("Aggregation operator and/or input type combination is invalid"); - } - - template - std::enable_if_t() and - (op == aggregation::LEAD || op == aggregation::LAG), - std::unique_ptr> - launch(column_view const& input, - column_view const& default_outputs, - PrecedingWindowIterator preceding, - FollowingWindowIterator following, - size_type min_periods, - rolling_aggregation const& agg, - agg_op const& device_agg_op, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto output = make_fixed_width_column( - target_type(input.type(), op), input.size(), mask_state::UNINITIALIZED, stream, mr); - - cudf::mutable_column_view output_view = output->mutable_view(); - auto valid_count = - kernel_launcher( - input, - default_outputs, - output_view, - preceding, - following, - min_periods, - agg, - device_agg_op, - stream); - - output->set_null_count(output->size() - valid_count); - - return output; - } - template - std::enable_if_t::type::is_supported(), std::unique_ptr> operator()(column_view const& input, column_view const& default_outputs, PrecedingWindowIterator preceding_window_begin, FollowingWindowIterator following_window_begin, - size_type min_periods, + int min_periods, rolling_aggregation const& agg, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(default_outputs.is_empty(), - "Only LEAD/LAG window functions support default values."); + auto const output_type = target_type(input.type(), op); + auto device_operator = create_rolling_operator{}(min_periods, agg); - return launch::type, - op, - PrecedingWindowIterator, - FollowingWindowIterator>(input, - default_outputs, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - stream, - mr); - } + auto output = + make_fixed_width_column(output_type, input.size(), mask_state::UNINITIALIZED, stream, mr); - // This variant is just to handle mean - template - std::enable_if_t<(op == aggregation::MEAN), std::unique_ptr> operator()( - column_view const& input, - column_view const& default_outputs, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - return launch( - input, - default_outputs, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - stream, - mr); - } + cudf::mutable_column_view output_view = output->mutable_view(); - template - std::enable_if_t() && - (op == aggregation::LEAD || op == aggregation::LAG), - std::unique_ptr> - operator()(column_view const& input, - column_view const& default_outputs, - PrecedingWindowIterator preceding_window_begin, - FollowingWindowIterator following_window_begin, - size_type min_periods, - 
rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - return launch( - input, - default_outputs, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - cudf::DeviceLeadLag{dynamic_cast(agg).row_offset}, - stream, - mr); + size_type valid_count{0}; + { + using Type = device_storage_type_t; + using OutType = device_storage_type_t>; + + constexpr cudf::size_type block_size = 256; + cudf::detail::grid_1d grid(input.size(), block_size); + + auto input_device_view = column_device_view::create(input, stream); + auto output_device_view = mutable_column_device_view::create(output_view, stream); + auto default_outputs_device_view = column_device_view::create(default_outputs, stream); + + rmm::device_scalar device_valid_count{0, stream}; + + if (input.has_nulls()) { + gpu_rolling + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + device_operator, + preceding_window_begin, + following_window_begin); + } else { + gpu_rolling + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + device_operator, + preceding_window_begin, + following_window_begin); + } + + valid_count = device_valid_count.value(stream); + + // check the stream for debugging + CHECK_CUDA(stream.value()); + } + + output->set_null_count(output->size() - valid_count); + + return output; } template - std::enable_if_t() && - (op == aggregation::LEAD || op == aggregation::LAG), + std::enable_if_t::type::is_supported(), std::unique_ptr> operator()(column_view const& input, column_view const& default_outputs, PrecedingWindowIterator preceding_window_begin, FollowingWindowIterator following_window_begin, - size_type min_periods, + int min_periods, rolling_aggregation const& agg, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::detail:: - compute_lead_lag_for_nested( - input, - default_outputs, - preceding_window_begin, - following_window_begin, - dynamic_cast(agg).row_offset, - stream, - mr); - } - - /** - * @brief Creates the offsets child of the result of the `COLLECT_LIST` window aggregation - * - * Given the input column, the preceding/following window bounds, and `min_periods`, - * the sizes of each list row may be computed. These values can then be used to - * calculate the offsets for the result of `COLLECT_LIST`. - * - * Note: If `min_periods` exceeds the number of observations for a window, the size - * is set to `0` (since the result is `null`). - */ - template - std::unique_ptr create_collect_offsets(size_type input_size, - PrecedingIter preceding_begin, - FollowingIter following_begin, - size_type min_periods, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - // Materialize offsets column. - auto static constexpr size_data_type = data_type{type_to_id()}; - auto sizes = - make_fixed_width_column(size_data_type, input_size, mask_state::UNALLOCATED, stream, mr); - auto mutable_sizes = sizes->mutable_view(); - - // Consider the following preceding/following values: - // preceding = [1,2,2,2,2] - // following = [1,1,1,1,0] - // The sum of the vectors should yield the window sizes: - // prec + foll = [2,3,3,3,2] - // - // If min_periods=2, all rows have at least `min_periods` observations. - // But if min_periods=3, rows at indices 0 and 4 have too few observations, and must return - // null. The sizes at these positions must be 0, i.e. 
- // prec + foll = [0,3,3,3,0] - thrust::transform(rmm::exec_policy(stream), - preceding_begin, - preceding_begin + input_size, - following_begin, - mutable_sizes.begin(), - [min_periods] __device__(auto preceding, auto following) { - return (preceding + following) < min_periods ? 0 : (preceding + following); - }); - - // Convert `sizes` to an offsets column, via inclusive_scan(): - return strings::detail::make_offsets_child_column( - sizes->view().begin(), sizes->view().end(), stream, mr); - } - - /** - * @brief Generate mapping of each row in the COLLECT_LIST result's child column - * to the index of the row it belongs to. - * - * If - * input col == [A,B,C,D,E] - * and preceding == [1,2,2,2,2], - * and following == [1,1,1,1,0], - * then, - * collect result == [ [A,B], [A,B,C], [B,C,D], [C,D,E], [D,E] ] - * i.e. result offset column == [0,2,5,8,11,13], - * and result child column == [A,B,A,B,C,B,C,D,C,D,E,D,E]. - * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] - */ - std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto static constexpr size_data_type = data_type{type_to_id()}; - - // First, reduce offsets column by key, to identify the number of times - // an offset appears. - // Next, scatter the count for each offset (except the first and last), - // into a column of N `0`s, where N == number of child rows. - // For the example above: - // offsets == [0, 2, 5, 8, 11, 13] - // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] - // - // If the above example had an empty list row at index 2, - // the same columns would look as follows: - // offsets == [0, 2, 5, 5, 8, 11, 13] - // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] - // - // Note: To correctly handle null list rows at the beginning of - // the output column, care must be taken to skip the first `0` - // in the offsets column, when running `reduce_by_key()`. - // This accounts for the `0` added by default to the offsets - // column, marking the beginning of the column. - - auto const num_child_rows{ - cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; - - auto scatter_values = - make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); - auto scatter_keys = - make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); - auto reduced_by_key = - thrust::reduce_by_key(rmm::exec_policy(stream), - offsets.template begin() + 1, // Skip first 0 in offsets. - offsets.template end(), - thrust::make_constant_iterator(1), - scatter_keys->mutable_view().template begin(), - scatter_values->mutable_view().template begin()); - auto scatter_values_end = reduced_by_key.second; - auto scatter_output = - make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); - thrust::fill_n(rmm::exec_policy(stream), - scatter_output->mutable_view().template begin(), - num_child_rows, - 0); // [0,0,0,...0] - thrust::scatter( - rmm::exec_policy(stream), - scatter_values->mutable_view().template begin(), - scatter_values_end, - scatter_keys->view().template begin(), - scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] - - // Next, generate mapping with inclusive_scan() on scatter() result. 
- // For the example above: - // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] - // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] - // - // For the case with an empty list at index 3: - // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] - // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] - auto per_row_mapping = - make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); - thrust::inclusive_scan(rmm::exec_policy(stream), - scatter_output->view().template begin(), - scatter_output->view().template end(), - per_row_mapping->mutable_view().template begin()); - return per_row_mapping; - } - - /** - * @brief Create gather map to generate the child column of the result of - * the `COLLECT_LIST` window aggregation. - */ - template - std::unique_ptr create_collect_gather_map(column_view const& child_offsets, - column_view const& per_row_mapping, - PrecedingIter preceding_iter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto gather_map = make_fixed_width_column(data_type{type_to_id()}, - per_row_mapping.size(), - mask_state::UNALLOCATED, - stream, - mr); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(per_row_mapping.size()), - gather_map->mutable_view().template begin(), - [d_offsets = - child_offsets.template begin(), // E.g. [0, 2, 5, 8, 11, 13] - d_groups = - per_row_mapping.template begin(), // E.g. [0,0, 1,1,1, 2,2,2, 3,3,3, 4,4] - d_prev = preceding_iter] __device__(auto i) { - auto group = d_groups[i]; - auto group_start_offset = d_offsets[group]; - auto relative_index = i - group_start_offset; - - return (group - d_prev[group] + 1) + relative_index; - }); - return gather_map; - } - - /** - * @brief Count null entries in result of COLLECT_LIST. - */ - size_type count_child_nulls(column_view const& input, - std::unique_ptr const& gather_map, - rmm::cuda_stream_view stream) - { - auto input_device_view = column_device_view::create(input, stream); - - auto input_row_is_null = [d_input = *input_device_view] __device__(auto i) { - return d_input.is_null_nocheck(i); - }; - - return thrust::count_if(rmm::exec_policy(stream), - gather_map->view().template begin(), - gather_map->view().template end(), - input_row_is_null); - } - - /** - * @brief Purge entries for null inputs from gather_map, and adjust offsets. - */ - std::pair, std::unique_ptr> purge_null_entries( - column_view const& input, - column_view const& gather_map, - column_view const& offsets, - size_type num_child_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto input_device_view = column_device_view::create(input, stream); - - auto input_row_not_null = [d_input = *input_device_view] __device__(auto i) { - return d_input.is_valid_nocheck(i); - }; - - // Purge entries in gather_map that correspond to null input. - auto new_gather_map = make_fixed_width_column(data_type{type_to_id()}, - gather_map.size() - num_child_nulls, - mask_state::UNALLOCATED, - stream, - mr); - thrust::copy_if(rmm::exec_policy(stream), - gather_map.template begin(), - gather_map.template end(), - new_gather_map->mutable_view().template begin(), - input_row_not_null); - - // Recalculate offsets after null entries are purged. 
- auto new_sizes = make_fixed_width_column( - data_type{type_to_id()}, input.size(), mask_state::UNALLOCATED, stream, mr); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - new_sizes->mutable_view().template begin(), - [d_gather_map = gather_map.template begin(), - d_old_offsets = offsets.template begin(), - input_row_not_null] __device__(auto i) { - return thrust::count_if(thrust::seq, - d_gather_map + d_old_offsets[i], - d_gather_map + d_old_offsets[i + 1], - input_row_not_null); - }); - - auto new_offsets = - strings::detail::make_offsets_child_column(new_sizes->view().template begin(), - new_sizes->view().template end(), - stream, - mr); - - return std::make_pair, std::unique_ptr>( - std::move(new_gather_map), std::move(new_offsets)); - } - - template - std::enable_if_t<(op == aggregation::COLLECT_LIST), std::unique_ptr> operator()( - column_view const& input, - column_view const& default_outputs, - PrecedingIter preceding_begin_raw, - FollowingIter following_begin_raw, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - CUDF_EXPECTS(default_outputs.is_empty(), - "COLLECT_LIST window function does not support default values."); - - if (input.is_empty()) return empty_like(input); - - // Fix up preceding/following iterators to respect column boundaries, - // similar to gpu_rolling(). - // `rolling_window()` does not fix up preceding/following so as not to read past - // column boundaries. - // `grouped_rolling_window()` and `time_range_based_grouped_rolling_window() do. - auto preceding_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [preceding_begin_raw] __device__(auto i) { - return thrust::min(preceding_begin_raw[i], i + 1); - }); - auto following_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [following_begin_raw, size = input.size()] __device__(auto i) { - return thrust::min(following_begin_raw[i], size - i - 1); - }); - - // Materialize collect list's offsets. - auto offsets = create_collect_offsets( - input.size(), preceding_begin, following_begin, min_periods, stream, mr); - - // Map each element of the collect() result's child column - // to the index where it appears in the input. - auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr); - - // Generate gather map to produce the collect() result's child column. - auto gather_map = create_collect_gather_map( - offsets->view(), per_row_mapping->view(), preceding_begin, stream, mr); - - // If gather_map collects null elements, and null_policy == EXCLUDE, - // those elements must be filtered out, and offsets recomputed. - auto null_handling = dynamic_cast(agg)._null_handling; - if (null_handling == null_policy::EXCLUDE && input.has_nulls()) { - auto num_child_nulls = count_child_nulls(input, gather_map, stream); - if (num_child_nulls != 0) { - std::tie(gather_map, offsets) = - purge_null_entries(input, *gather_map, *offsets, num_child_nulls, stream, mr); - } - } - - // gather(), to construct child column. 
- auto gather_output = - cudf::gather(table_view{std::vector{input}}, gather_map->view()); - - rmm::device_buffer null_mask; - size_type null_count; - std::tie(null_mask, null_count) = valid_if( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - [preceding_begin, following_begin, min_periods] __device__(auto i) { - return (preceding_begin[i] + following_begin[i]) >= min_periods; - }, - stream, - mr); - - return make_lists_column(input.size(), - std::move(offsets), - std::move(gather_output->release()[0]), - null_count, - std::move(null_mask), - stream, - mr); + CUDF_FAIL("Invalid aggregation type/pair"); } }; +/** + * @brief Functor for performing the high level rolling logic. + * + * This does 3 basic things: + * + * - It calls the preprocess step on incoming aggregation/type pairs + * - It calls the aggregation-dispatched gpu-rolling operation + * - It calls the final postprocess step + */ struct dispatch_rolling { - template + template std::unique_ptr operator()(column_view const& input, column_view const& default_outputs, PrecedingWindowIterator preceding_window_begin, @@ -1220,16 +923,40 @@ struct dispatch_rolling { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return aggregation_dispatcher(agg.kind, - rolling_window_launcher{}, - input, - default_outputs, - preceding_window_begin, - following_window_begin, - min_periods, - agg, - stream, - mr); + // do any preprocessing of aggregations (eg, MIN -> ARGMIN, COLLECT_LIST -> nothing) + rolling_aggregation_preprocessor preprocessor; + auto preprocessed_aggs = agg.get_simple_aggregations(input.type(), preprocessor); + CUDF_EXPECTS(preprocessed_aggs.size() <= 1, + "Encountered a non-trivial rolling aggregation result"); + + // perform the rolling window if we produced an aggregation to use + auto intermediate = preprocessed_aggs.size() > 0 + ? aggregation_dispatcher( + dynamic_cast(*preprocessed_aggs[0]).kind, + rolling_window_launcher{}, + input, + default_outputs, + preceding_window_begin, + following_window_begin, + min_periods, + dynamic_cast(*preprocessed_aggs[0]), + stream, + mr) + : nullptr; + + // finalize. 
+ auto const result_type = target_type(input.type(), agg.kind); + rolling_aggregation_postprocessor postprocessor(input, + default_outputs, + result_type, + preceding_window_begin, + following_window_begin, + min_periods, + std::move(intermediate), + stream, + mr); + agg.finalize(postprocessor); + return postprocessor.get_result(); } }; @@ -1250,8 +977,9 @@ std::unique_ptr rolling_window_udf(column_view const& input, static_assert(warp_size == cudf::detail::size_in_bits(), "bitmask_type size does not match CUDA warp size"); - if (input.has_nulls()) + if (input.has_nulls()) { CUDF_FAIL("Currently the UDF version of rolling window does NOT support inputs with nulls."); + } min_periods = std::max(min_periods, 0); @@ -1333,14 +1061,20 @@ std::unique_ptr rolling_window(column_view const& input, static_assert(warp_size == cudf::detail::size_in_bits(), "bitmask_type size does not match CUDA warp size"); - if (input.is_empty()) return empty_like(input); + if (input.is_empty()) { return empty_like(input); } - if (cudf::is_dictionary(input.type())) + if (cudf::is_dictionary(input.type())) { CUDF_EXPECTS(agg.kind == aggregation::COUNT_ALL || agg.kind == aggregation::COUNT_VALID || agg.kind == aggregation::ROW_NUMBER || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || agg.kind == aggregation::LEAD || agg.kind == aggregation::LAG, "Invalid aggregation for dictionary column"); + } + + if (agg.kind != aggregation::LEAD && agg.kind != aggregation::LAG) { + CUDF_EXPECTS(default_outputs.is_empty(), + "Only LEAD/LAG window functions support default values."); + } min_periods = std::max(min_periods, 0); @@ -1358,12 +1092,14 @@ std::unique_ptr rolling_window(column_view const& input, agg, stream, mr); + if (!cudf::is_dictionary(input.type())) return output; // dictionary column post processing if (agg.kind == aggregation::COUNT_ALL || agg.kind == aggregation::COUNT_VALID || - agg.kind == aggregation::ROW_NUMBER) + agg.kind == aggregation::ROW_NUMBER) { return output; + } // output is new dictionary indices (including nulls) auto keys = std::make_unique(dictionary_column_view(input).keys(), stream, mr); diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index 18bd0ea2217..bd64cc39f47 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -25,65 +25,6 @@ namespace cudf { // helper functions - used in the rolling window implementation and tests namespace detail { -// return true the aggregation is valid for the specified ColumnType -// valid aggregations may still be further specialized (eg, is_string_specialized) -template -static constexpr bool is_rolling_supported() -{ - if (!cudf::detail::is_valid_aggregation()) { - return false; - } else if (cudf::is_numeric() or cudf::is_duration()) { - constexpr bool is_comparable_countable_op = std::is_same::value or - std::is_same::value or - std::is_same::value; - - constexpr bool is_operation_supported = - (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or - (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::MEAN) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); - - constexpr bool is_valid_numeric_agg = - (cudf::is_numeric() or cudf::is_duration() or - is_comparable_countable_op) and - is_operation_supported; - - return is_valid_numeric_agg; - - } else if (cudf::is_timestamp()) { - return (op == aggregation::MIN) or (op == 
aggregation::MAX) or - (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); - } else if (cudf::is_fixed_point()) { - return (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or - (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); - } else if (std::is_same()) { - return (op == aggregation::MIN) or (op == aggregation::MAX) or - (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); - - } else if (std::is_same()) { - return (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); - } else if (std::is_same()) { - // TODO: Add support for COUNT_VALID, COUNT_ALL, ROW_NUMBER. - return op == aggregation::COLLECT_LIST; - } else { - return false; - } -} - -// return true if this Op is specialized for strings. -template -static constexpr bool is_rolling_string_specialization() -{ - return std::is_same::value and - ((aggregation::MIN == Op and std::is_same::value) or - (aggregation::MAX == Op and std::is_same::value)); -} // store functor template diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 804fd715951..cb123114fd8 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "rolling_test.hpp" + #include #include #include @@ -336,7 +338,7 @@ class GroupedRollingTest : public cudf::test::BaseFixture { cudf::aggregation::Kind k, typename OutputType, bool is_mean, - std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr create_reference_output(cudf::column_view const& input, std::vector const& group_offsets, size_type const& preceding_window, @@ -393,7 +395,7 @@ class GroupedRollingTest : public cudf::test::BaseFixture { cudf::aggregation::Kind k, typename OutputType, bool is_mean, - std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr create_reference_output(cudf::column_view const& input, std::vector const& group_offsets, size_type const& preceding_window_col, @@ -953,7 +955,7 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture { cudf::aggregation::Kind k, typename OutputType, bool is_mean, - std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr create_reference_output(cudf::column_view const& timestamp_column, cudf::order const& timestamp_order, cudf::column_view const& input, @@ -1037,7 +1039,7 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture { cudf::aggregation::Kind k, typename OutputType, bool is_mean, - std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr create_reference_output(cudf::column_view const& timestamp_column, cudf::order const& timestamp_order, cudf::column_view const& input, diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c22acf6b022..a67e670acb7 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "rolling_test.hpp" + #include #include #include @@ -298,7 +300,7 @@ class RollingTest : public cudf::test::BaseFixture { cudf::aggregation::Kind k, typename OutputType, bool is_mean, - std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr create_reference_output( cudf::column_view const& input, std::vector const& preceding_window_col, @@ -353,7 +355,7 @@ class RollingTest : public cudf::test::BaseFixture { cudf::aggregation::Kind k, typename OutputType, bool is_mean, - std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr create_reference_output( cudf::column_view const& input, std::vector const& preceding_window_col, diff --git a/cpp/tests/rolling/rolling_test.hpp b/cpp/tests/rolling/rolling_test.hpp new file mode 100644 index 00000000000..cca82b15826 --- /dev/null +++ b/cpp/tests/rolling/rolling_test.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +// return true if the aggregation is valid for the specified ColumnType +// valid aggregations may still be further specialized (eg, is_string_specialized) +template +static constexpr bool is_rolling_supported() +{ + using namespace cudf; + + if (!cudf::detail::is_valid_aggregation()) { + return false; + } else if (cudf::is_numeric() or cudf::is_duration()) { + constexpr bool is_comparable_countable_op = std::is_same::value or + std::is_same::value or + std::is_same::value; + + constexpr bool is_operation_supported = + (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or + (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or + (op == aggregation::MEAN) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); + + constexpr bool is_valid_numeric_agg = + (cudf::is_numeric() or cudf::is_duration() or + is_comparable_countable_op) and + is_operation_supported; + + return is_valid_numeric_agg; + + } else if (cudf::is_timestamp()) { + return (op == aggregation::MIN) or (op == aggregation::MAX) or + (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or + (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); + } else if (cudf::is_fixed_point()) { + return (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or + (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or + (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); + } else if (std::is_same()) { + return (op == aggregation::MIN) or (op == aggregation::MAX) or + (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); + + } else if (std::is_same()) { + return (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); + } else if (std::is_same()) { + // TODO: Add support for COUNT_VALID, COUNT_ALL, ROW_NUMBER. + return op == aggregation::COLLECT_LIST; + } else { + return false; + } +} From 7e725b5274b32c235a314e2ecff3460b56255a0b Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Mon, 24 May 2021 15:25:51 -0500 Subject: [PATCH 20/24] Do not add nulls to the hash table when null_equality::NOT_EQUAL is passed to left_semi_join and left_anti_join (#8277) Fixes https://github.com/rapidsai/cudf/issues/7300 This is fundamentally the same issue and fix as https://github.com/rapidsai/cudf/pull/6943/files from @hyperbolic2346 When nulls are considered not equal (`null_equality::NOT_EQUAL`) there is no point in adding them to the hash table used for the join as they will never compare as true against anything. Adding large numbers of nulls was causing huge performance issues. Includes a fix to doxygen comments for `left_anti_join` Performance gain is tremendous. 
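To make the reasoning concrete, here is a toy host-side sketch of a single-column semi-join under `NOT_EQUAL` null semantics (an illustration only, not the libcudf kernel, which works on the GPU against the flattened key columns' validity bitmask):

```
#include <cstddef>
#include <optional>
#include <unordered_set>
#include <vector>

// Toy left semi-join on one key column; std::nullopt stands in for a NULL key.
// Under null_equality::NOT_EQUAL a NULL never compares equal to anything, so
// NULL rows on the build (right) side can be skipped without changing the result.
std::vector<std::size_t> left_semi_join_not_equal(std::vector<std::optional<int>> const& left,
                                                  std::vector<std::optional<int>> const& right)
{
  std::unordered_set<int> build;
  for (auto const& key : right) {
    if (key.has_value()) { build.insert(*key); }  // skip NULLs instead of hashing them
  }
  std::vector<std::size_t> matches;
  for (std::size_t i = 0; i < left.size(); ++i) {
    if (left[i].has_value() and build.count(*left[i])) { matches.push_back(i); }
  }
  return matches;
}
```

The change below applies the same idea on the device by AND-ing the right table's null masks and inserting only rows whose validity bit is set; the before/after numbers show the effect of skipping those inserts.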
Before: ``` Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------------------------------------- Join/left_anti_join_32bit_nulls/100000/100000/manual_time 1072 ms 1072 ms 1 Join/left_anti_join_32bit_nulls/200000/400000/manual_time 4253 ms 4253 ms 1 Join/left_anti_join_32bit_nulls/300000/1000000/manual_time 14016 ms 14016 ms 1 Join/left_semi_join_32bit_nulls/100000/100000/manual_time 932 ms 932 ms 1 Join/left_semi_join_32bit_nulls/200000/400000/manual_time 4481 ms 4481 ms 1 Join/left_semi_join_32bit_nulls/300000/1000000/manual_time 14172 ms 14172 ms 1 ``` After: ``` ----------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------------------------------------- Join/left_anti_join_32bit_nulls/100000/100000/manual_time 0.143 ms 0.162 ms 4996 Join/left_anti_join_32bit_nulls/200000/400000/manual_time 0.255 ms 0.275 ms 2780 Join/left_anti_join_32bit_nulls/300000/1000000/manual_time 0.514 ms 0.532 ms 1368 Join/left_semi_join_32bit_nulls/100000/100000/manual_time 0.135 ms 0.155 ms 5203 Join/left_semi_join_32bit_nulls/200000/400000/manual_time 0.206 ms 0.224 ms 3325 Join/left_semi_join_32bit_nulls/300000/1000000/manual_time 0.368 ms 0.385 ms 1903 ``` Authors: - https://github.com/nvdbaranec Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Mike Wilson (https://github.com/hyperbolic2346) - Robert Maynard (https://github.com/robertmaynard) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8277 --- cpp/benchmarks/join/join_benchmark.cu | 135 ++++++++++++++- cpp/include/cudf/join.hpp | 4 +- cpp/src/join/semi_join.cu | 22 ++- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/join/semi_anti_join_tests.cpp | 217 ++++++++++++++++++++++++ cpp/tests/join/semi_join_tests.cpp | 109 ------------ 6 files changed, 365 insertions(+), 124 deletions(-) create mode 100644 cpp/tests/join/semi_anti_join_tests.cpp delete mode 100644 cpp/tests/join/semi_join_tests.cpp diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index d1c11696ddd..a7c109db9b4 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/join_benchmark.cu @@ -37,8 +37,8 @@ template class Join : public cudf::benchmark { }; -template -static void BM_join(benchmark::State &state) +template +static void BM_join(benchmark::State& state, Join JoinFunc) { const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; @@ -105,20 +105,69 @@ static void BM_join(benchmark::State &state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto result = cudf::inner_join( + auto result = JoinFunc( probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); } } -#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) { BM_join(st); } +#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + std::vector const& left_on, \ + std::vector 
const& right_on, \ + cudf::null_equality compare_nulls) { \ + return cudf::inner_join(left, right, left_on, right_on, compare_nulls); \ + }; \ + BM_join(st, join); \ + } JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false); JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false); JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true); JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true); +#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + std::vector const& left_on, \ + std::vector const& right_on, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_anti_join(left, right, left_on, right_on, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, int32_t, false); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, int64_t, false); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, int32_t, true); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, int64_t, true); + +#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + std::vector const& left_on, \ + std::vector const& right_on, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_semi_join(left, right, left_on, right_on, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, int32_t, false); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true); + +// join ----------------------------------------------------------------------- BENCHMARK_REGISTER_F(Join, join_32bit) ->Unit(benchmark::kMillisecond) ->Args({100'000, 100'000}) @@ -154,3 +203,77 @@ BENCHMARK_REGISTER_F(Join, join_64bit_nulls) ->Args({50'000'000, 50'000'000}) ->Args({40'000'000, 120'000'000}) ->UseManualTime(); + +// left anti-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_anti_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_anti_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_anti_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + 
->Args({40'000'000, 120'000'000}) + ->UseManualTime(); + +// left semi-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(Join, left_semi_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_semi_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_semi_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_semi_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 5a2c913d4c3..428a4195bf8 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -424,13 +424,13 @@ std::unique_ptr> left_anti_join( * TableB: {{1, 2, 3}, {1, 2, 5}} * left_on: {0} * right_on: {1} - * Result: {{0}, {1}} + * Result: {{0}} * * TableA: {{0, 1, 2}, {1, 2, 5}} * TableB: {{1, 2, 3}} * left_on: {0} * right_on: {0} - * Result: { {0} {1} } + * Result: { {0}, {1} } * @endcode * * @throw cudf::logic_error if number of elements in `left_on` or `right_on` diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index c029f0272da..cc34aed33ea 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -92,12 +92,22 @@ std::unique_ptr> left_semi_anti_join( equality_build); auto hash_table = *hash_table_ptr; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - right_num_rows, - [hash_table] __device__(size_type idx) mutable { - hash_table.insert(thrust::make_pair(idx, true)); - }); + // if compare_nulls == UNEQUAL, we can simply ignore any rows that + // contain a NULL in any column as they will never compare to equal. + auto const row_bitmask = (compare_nulls == null_equality::EQUAL) + ? rmm::device_buffer{} + : cudf::detail::bitmask_and(right_flattened_keys, stream); + // skip rows that are null here. 
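+  // (with null_equality::EQUAL the buffer above stays empty, so row_bitmask below is
+  //  null and every right row is inserted exactly as before)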
+ thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + right_num_rows, + [hash_table, row_bitmask = static_cast(row_bitmask.data())] __device__( + size_type idx) mutable { + if (!row_bitmask || cudf::bit_is_set(row_bitmask, idx)) { + hash_table.insert(thrust::make_pair(idx, true)); + } + }); // // Now we have a hash table, we need to iterate over the rows of the left table diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index bbcfd69a52b..b0377d8d2dc 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -84,7 +84,7 @@ ConfigureTest(GROUPBY_TEST ConfigureTest(JOIN_TEST join/join_tests.cpp join/cross_join_tests.cpp - join/semi_join_tests.cpp) + join/semi_anti_join_tests.cpp) ################################################################################################### # - is_sorted tests ------------------------------------------------------------------------------- diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp new file mode 100644 index 00000000000..5b38bafb122 --- /dev/null +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +template +using column_wrapper = cudf::test::fixed_width_column_wrapper; +using strcol_wrapper = cudf::test::strings_column_wrapper; +using column_vector = std::vector>; +using Table = cudf::table; + +struct JoinTest : public cudf::test::BaseFixture { +}; + +std::pair, std::unique_ptr> get_saj_tables( + std::vector const& left_is_human_nulls, std::vector const& right_is_human_nulls) +{ + column_wrapper col0_0{{99, 1, 2, 0, 2}, {0, 1, 1, 1, 1}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + auto col0_names_col = strcol_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"}; + auto col0_ages_col = column_wrapper{{48, 27, 351, 31, 25}}; + + auto col0_is_human_col = + column_wrapper{{true, true, false, false, false}, left_is_human_nulls.begin()}; + + auto col0_3 = cudf::test::structs_column_wrapper{ + {col0_names_col, col0_ages_col, col0_is_human_col}, {1, 1, 1, 1, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, -99}, {1, 1, 1, 1, 0}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + auto col1_names_col = strcol_wrapper{"Carrot Ironfoundersson", + "Angua von Überwald", + "Detritus", + "Carrot Ironfoundersson", + "Samuel Vimes"}; + auto col1_ages_col = column_wrapper{{351, 25, 27, 31, 48}}; + + auto col1_is_human_col = + column_wrapper{{true, false, false, false, true}, right_is_human_nulls.begin()}; + + auto col1_3 = + cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}}; + + column_vector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols0.push_back(col0_3.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + cols1.push_back(col1_3.release()); + + return {std::make_unique(std::move(cols0)), std::make_unique
(std::move(cols1))}; +} + +TEST_F(JoinTest, SemiJoinWithStructsAndNulls) +{ + auto tables = get_saj_tables({1, 1, 0, 1, 0}, {1, 0, 0, 1, 1}); + + auto result = cudf::left_semi_join( + *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{99, 2}, {0, 1}}; + strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); + column_wrapper col_gold_2{{0, 1}}; + auto col_gold_3_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"}; + auto col_gold_3_ages_col = column_wrapper{{48, 25}}; + + auto col_gold_3_is_human_col = column_wrapper{{true, false}, {1, 0}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}}; + + column_vector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + +TEST_F(JoinTest, SemiJoinWithStructsAndNullsNotEqual) +{ + auto tables = get_saj_tables({1, 1, 0, 1, 1}, {1, 1, 0, 1, 1}); + + auto result = cudf::left_semi_join( + *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::UNEQUAL); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{2}, {1}}; + strcol_wrapper col_gold_1({"s0"}, {1}); + column_wrapper col_gold_2{{1}}; + auto col_gold_3_names_col = strcol_wrapper{"Angua von Überwald"}; + auto col_gold_3_ages_col = column_wrapper{{25}}; + + auto col_gold_3_is_human_col = column_wrapper{{false}, {1}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}}; + + column_vector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + +TEST_F(JoinTest, AntiJoinWithStructsAndNulls) +{ + auto tables = get_saj_tables({1, 1, 0, 1, 0}, {1, 0, 0, 1, 1}); + + auto result = cudf::left_anti_join( + *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{1, 2, 0}, {1, 1, 1}}; + strcol_wrapper col_gold_1({"s1", "s0", "s4"}, {1, 0, 1}); + column_wrapper col_gold_2{{1, 2, 4}}; + auto col_gold_3_names_col = strcol_wrapper{"Carrot Ironfoundersson", "Detritus", "Samuel Vimes"}; + auto col_gold_3_ages_col = column_wrapper{{27, 351, 31}}; + + auto col_gold_3_is_human_col = column_wrapper{{true, false, false}, {1, 0, 1}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}}; + + column_vector cols_gold; + 
cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + +TEST_F(JoinTest, AntiJoinWithStructsAndNullsNotEqual) +{ + auto tables = get_saj_tables({1, 1, 0, 1, 1}, {1, 1, 0, 1, 1}); + + auto result = cudf::left_anti_join( + *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::UNEQUAL); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{99, 1, 2, 0}, {0, 1, 1, 1}}; + strcol_wrapper col_gold_1({"s1", "s1", "s0", "s4"}, {1, 1, 0, 1}); + column_wrapper col_gold_2{{0, 1, 2, 4}}; + auto col_gold_3_names_col = + strcol_wrapper{"Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes"}; + auto col_gold_3_ages_col = column_wrapper{{48, 27, 351, 31}}; + + auto col_gold_3_is_human_col = column_wrapper{{true, true, false, false}, {1, 1, 0, 1}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}}; + + column_vector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp deleted file mode 100644 index 178a26dfdba..00000000000 --- a/cpp/tests/join/semi_join_tests.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -template -using column_wrapper = cudf::test::fixed_width_column_wrapper; -using strcol_wrapper = cudf::test::strings_column_wrapper; -using column_vector = std::vector>; -using Table = cudf::table; - -struct JoinTest : public cudf::test::BaseFixture { -}; - -TEST_F(JoinTest, SemiJoinWithStructsAndNulls) -{ - column_wrapper col0_0{{3, 1, 2, 0, 2}}; - strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); - column_wrapper col0_2{{0, 1, 2, 4, 1}}; - auto col0_names_col = strcol_wrapper{ - "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"}; - auto col0_ages_col = column_wrapper{{48, 27, 351, 31, 25}}; - - auto col0_is_human_col = column_wrapper{{true, true, false, false, false}, {1, 1, 0, 1, 0}}; - - auto col0_3 = cudf::test::structs_column_wrapper{ - {col0_names_col, col0_ages_col, col0_is_human_col}, {1, 1, 1, 1, 1}}; - - column_wrapper col1_0{{2, 2, 0, 4, 3}}; - strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); - column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; - auto col1_names_col = strcol_wrapper{"Carrot Ironfoundersson", - "Angua von Überwald", - "Detritus", - "Carrot Ironfoundersson", - "Samuel Vimes"}; - auto col1_ages_col = column_wrapper{{351, 25, 27, 31, 48}}; - - auto col1_is_human_col = column_wrapper{{true, false, false, false, true}, {1, 0, 0, 1, 1}}; - - auto col1_3 = - cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}}; - - column_vector cols0, cols1; - cols0.push_back(col0_0.release()); - cols0.push_back(col0_1.release()); - cols0.push_back(col0_2.release()); - cols0.push_back(col0_3.release()); - cols1.push_back(col1_0.release()); - cols1.push_back(col1_1.release()); - cols1.push_back(col1_2.release()); - cols1.push_back(col1_3.release()); - - Table t0(std::move(cols0)); - Table t1(std::move(cols1)); - - auto result = cudf::left_semi_join(t0, t1, {0, 1, 3}, {0, 1, 3}); - auto result_sort_order = cudf::sorted_order(result->view()); - auto sorted_result = cudf::gather(result->view(), *result_sort_order); - - column_wrapper col_gold_0{{3, 2}}; - strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); - column_wrapper col_gold_2{{0, 1}}; - auto col_gold_3_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"}; - auto col_gold_3_ages_col = column_wrapper{{48, 25}}; - - auto col_gold_3_is_human_col = column_wrapper{{true, false}, {1, 0}}; - - auto col_gold_3 = cudf::test::structs_column_wrapper{ - {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}}; - - column_vector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - cols_gold.push_back(col_gold_2.release()); - cols_gold.push_back(col_gold_3.release()); - Table gold(std::move(cols_gold)); - - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); -} From 7eaf3d7f8e9e4c886505e2aef402af6679f85f4d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 24 May 2021 13:34:58 -0700 Subject: [PATCH 21/24] Preserve column hierarchy when getting NULL row from `LIST` column (#8206) This PR fixes a bug introduced in #8071, when `get_element` retrieves a NULL row in a nested column, the scalar returned not only should be `is_valid() == false`, but also should preserve the column hierarchy of 
the row-data, even they are invalid. Because depending libraries may use the column hierarchy to deduce the nested type of the column. This PR also reverts `make_default_constructed_scalar` API for `LIST` type. A `LIST` type scalar should have complete column hierarchy as part of its type information. There isn't enough information provided to the API to construct that. Another tiny addition: instead of hard coding the position of child column, use `list_column_view::child_column_index` intead. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Conor Hoekstra (https://github.com/codereport) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/8206 --- cpp/src/copying/get_element.cu | 9 ++-- cpp/src/scalar/scalar_factories.cpp | 2 +- cpp/tests/copying/get_value_tests.cpp | 74 +++++++++++++++++++++------ 3 files changed, 64 insertions(+), 21 deletions(-) diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index dc0334bd37b..a4d863d204d 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -125,7 +125,8 @@ struct get_element_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { - bool valid = is_element_valid_sync(input, index, stream); + bool valid = is_element_valid_sync(input, index, stream); + auto const child_col_idx = lists_column_view::child_column_index; if (valid) { lists_column_view lcv(input); @@ -134,9 +135,11 @@ struct get_element_functor { lists::detail::copy_slice(lcv, index, index + 1, stream, mr)->release(); // Construct scalar with row data return std::make_unique( - std::move(*row_slice_contents.children[1]), valid, stream, mr); + std::move(*row_slice_contents.children[child_col_idx]), valid, stream, mr); } else { - return make_default_constructed_scalar(data_type(type_id::LIST)); + auto empty_row_contents = empty_like(input)->release(); + return std::make_unique( + std::move(*empty_row_contents.children[child_col_idx]), valid, stream, mr); } } diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index e1d71b279d6..af78d84d874 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -146,7 +146,7 @@ template <> std::unique_ptr default_scalar_functor::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return std::make_unique(column(), false, stream, mr); + CUDF_FAIL("list_view type not supported"); } template <> diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 7d2bc458462..40dc07512eb 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -223,12 +223,17 @@ TYPED_TEST(ListGetFixedWidthValueTest, NonNestedGetNonNullEmpty) TYPED_TEST(ListGetFixedWidthValueTest, NonNestedGetNull) { using LCW = cudf::test::lists_column_wrapper; + using FCW = cudf::test::fixed_width_column_wrapper; + LCW col({LCW{1, 2, 34}, LCW{}, LCW{1}, LCW{}}, this->odds_valid()); size_type index = 2; - auto s = get_element(col, index); + auto s = get_element(col, index); + auto typed_s = static_cast(s.get()); EXPECT_FALSE(s->is_valid()); + // Test preserve column hierarchy + CUDF_TEST_EXPECT_COLUMNS_EQUAL(typed_s->view(), FCW{}); } TYPED_TEST(ListGetFixedWidthValueTest, NestedGetNonNullNonEmpty) @@ -301,7 +306,9 @@ TYPED_TEST(ListGetFixedWidthValueTest, NestedGetNonNullEmpty) TYPED_TEST(ListGetFixedWidthValueTest, 
NestedGetNull) { - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using FCW = cudf::test::fixed_width_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; std::vector valid{1, 0, 1, 0}; // clang-format off @@ -315,9 +322,15 @@ TYPED_TEST(ListGetFixedWidthValueTest, NestedGetNull) // clang-format on size_type index = 1; - auto s = get_element(col, index); + auto s = get_element(col, index); + auto typed_s = static_cast(s.get()); + + auto expected_data = + make_lists_column(0, offset_t{}.release(), FCW{}.release(), 0, rmm::device_buffer{}); EXPECT_FALSE(s->is_valid()); + // Test preserve column hierarchy + CUDF_TEST_EXPECT_COLUMNS_EQUAL(typed_s->view(), *expected_data); } struct ListGetStringValueTest : public BaseFixture { @@ -363,15 +376,18 @@ TEST_F(ListGetStringValueTest, NonNestedGetNonNullEmpty) TEST_F(ListGetStringValueTest, NonNestedGetNull) { - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using StringCW = strings_column_wrapper; std::vector valid{1, 0, 0, 1}; LCW col({LCW{"aaa", "Héllo"}, LCW{}, LCW{""}, LCW{"42"}}, valid.begin()); size_type index = 2; - auto s = get_element(col, index); + auto s = get_element(col, index); + auto typed_s = static_cast(s.get()); EXPECT_FALSE(s->is_valid()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(typed_s->view(), StringCW{}); } TEST_F(ListGetStringValueTest, NestedGetNonNullNonEmpty) @@ -446,7 +462,9 @@ TEST_F(ListGetStringValueTest, NestedGetNonNullEmpty) TEST_F(ListGetStringValueTest, NestedGetNull) { - using LCW = cudf::test::lists_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; std::vector valid{0, 0, 1, 1}; // clang-format off @@ -458,11 +476,16 @@ TEST_F(ListGetStringValueTest, NestedGetNull) LCW{} }, valid.begin()); // clang-format on - LCW expected_data{}; size_type index = 0; - auto s = get_element(col, index); + auto s = get_element(col, index); + auto typed_s = static_cast(s.get()); + + auto expected_data = + make_lists_column(0, offset_t{}.release(), StringCW{}.release(), 0, rmm::device_buffer{}); + EXPECT_FALSE(s->is_valid()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_data, typed_s->view()); } /** @@ -480,10 +503,11 @@ struct ListGetStructValueTest : public BaseFixture { * in `initializer_list`. However this is an expensive function because it repeatedly * calls `cudf::set_null_mask` for each row. 
*/ - std::unique_ptr make_test_lists_column(size_type num_lists, - fixed_width_column_wrapper offsets, - std::unique_ptr child, - std::initializer_list null_mask) + std::unique_ptr make_test_lists_column( + size_type num_lists, + fixed_width_column_wrapper offsets, + std::unique_ptr child, + std::initializer_list null_mask) { size_type null_count = num_lists - std::accumulate(null_mask.begin(), null_mask.end(), 0); auto d_null_mask = cudf::create_null_mask( @@ -619,7 +643,7 @@ TYPED_TEST(ListGetStructValueTest, NonNestedGetNonNullEmpty) // 3-rows // [{1, NULL, NULL}, NULL] // [{3, "xyz", [3, 8, 4]}] - // [] <- get_element(0) + // [] <- get_element(2) auto list_column = this->make_test_lists_column(3, {0, 2, 3, 3}, this->leaf_data(), {1, 1, 1}); size_type index = 2; @@ -638,15 +662,21 @@ TYPED_TEST(ListGetStructValueTest, NonNestedGetNonNullEmpty) TYPED_TEST(ListGetStructValueTest, NonNestedGetNull) { // 2-rows - // NULL <- get_element(0) + // NULL <- get_element(0) // [{3, "xyz", [3, 8, 4]}] + using valid_t = std::vector; + auto list_column = this->make_test_lists_column(2, {0, 2, 3}, this->leaf_data(), {0, 1}); size_type index = 0; - auto s = get_element(list_column->view(), index); + auto s = get_element(list_column->view(), index); + auto typed_s = static_cast(s.get()); + + auto expected_data = this->make_test_structs_column({}, {}, {}, valid_t{}.begin()); EXPECT_FALSE(s->is_valid()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(typed_s->view(), expected_data); } TYPED_TEST(ListGetStructValueTest, NestedGetNonNullNonEmpty) @@ -716,7 +746,7 @@ TYPED_TEST(ListGetStructValueTest, NestedGetNonNullEmpty) { // 3-rows // [[{1, NULL, NULL}, NULL]] - // [] <- get_element(1) + // [] <- get_element(1) // [[{3, "xyz", [3, 8, 4]}]] auto list_column = this->make_test_lists_column(2, {0, 2, 3}, this->leaf_data(), {1, 1}); @@ -741,15 +771,25 @@ TYPED_TEST(ListGetStructValueTest, NestedGetNull) // 3-rows // [[{1, NULL, NULL}, NULL]] // [] - // NULL <- get_element(1) + // NULL <- get_element(2) + + using valid_t = std::vector; + using offset_t = cudf::test::fixed_width_column_wrapper; + auto list_column = this->make_test_lists_column(2, {0, 2, 3}, this->leaf_data(), {1, 1}); auto list_column_nested = this->make_test_lists_column(3, {0, 1, 1, 2}, std::move(list_column), {1, 1, 0}); size_type index = 2; auto s = get_element(list_column_nested->view(), index); + auto typed_s = static_cast(s.get()); + + auto nested = this->make_test_structs_column({}, {}, {}, valid_t{}.begin()); + auto expected_data = + make_lists_column(0, offset_t{}.release(), nested.release(), 0, rmm::device_buffer{}); EXPECT_FALSE(s->is_valid()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_data, typed_s->view()); } } // namespace test From c398054f21d1a26de3d9c51dba581d1a0c6306d5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 24 May 2021 13:37:07 -0700 Subject: [PATCH 22/24] Support scattering `list_scalar` (#8256) This PR adds support for scattering `list_scalar` into a `LIST` column. This PR refactors current scatter for `LIST` column interface and allows column and scalar scatter share the same code path. 
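For example, scattering a single list row into selected rows of a `LIST` column can now be written along these lines (a sketch adapted from the `single_scalar_scatter` helper in the new test file, assuming `cudf::list_scalar`'s `column_view` constructor as exercised by the tests; the concrete column contents are illustrative only):

```
#include <cudf/copying.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <functional>
#include <vector>

void scatter_list_scalar_example()
{
  using LCW = cudf::test::lists_column_wrapper<int32_t>;

  // One list row worth of data: [2, 2, 2].
  cudf::test::fixed_width_column_wrapper<int32_t> row_data{2, 2, 2};
  cudf::list_scalar slr{row_data, true};

  // Target LIST<int32> column and the rows to overwrite.
  LCW target{LCW{1, 1, 1}, LCW{8, 8}, LCW{10, 10, 10, 10}};
  cudf::test::fixed_width_column_wrapper<cudf::size_type> scatter_map{0, 2};

  // Same call pattern as the test helper: one scalar per target column.
  std::vector<std::reference_wrapper<const cudf::scalar>> sources{slr};
  auto result = cudf::scatter(sources, scatter_map, cudf::table_view{{target}}, true);
  // result->get_column(0) now holds [[2, 2, 2], [8, 8], [2, 2, 2]].
}
```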
This PR also removes all debugging codes in `scatter.cuh` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/8256 --- cpp/include/cudf/lists/detail/scatter.cuh | 321 +++++++----- cpp/src/copying/scatter.cu | 11 +- cpp/tests/CMakeLists.txt | 1 + .../copying/scatter_list_scalar_tests.cpp | 458 ++++++++++++++++++ 4 files changed, 660 insertions(+), 131 deletions(-) create mode 100644 cpp/tests/copying/scatter_list_scalar_tests.cpp diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index aec45d260bf..33d3d1cb09e 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,8 @@ #include #include +#include +#include #include @@ -141,19 +144,22 @@ struct unbound_list_view { size_type _size{}; // Number of elements in *this* list row. }; +template rmm::device_uvector list_vector_from_column( unbound_list_view::label_type label, cudf::detail::lists_column_device_view const& lists_column, + IndexIterator index_begin, + IndexIterator index_end, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto n_rows = lists_column.size(); + auto n_rows = thrust::distance(index_begin, index_end); auto vector = rmm::device_uvector(n_rows, stream, mr); thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(n_rows), + index_begin, + index_end, vector.begin(), [label, lists_column] __device__(size_type row_index) { return unbound_list_view{label, lists_column, row_index}; @@ -204,43 +210,6 @@ std::pair construct_child_nullmask( mr); } -#ifndef NDEBUG -void print(std::string const& msg, column_view const& col, rmm::cuda_stream_view stream) -{ - if (col.type().id() != type_id::INT32) { - std::cout << "[Cannot print non-INT32 column.]" << std::endl; - return; - } - - std::cout << msg << " = ["; - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - [c = col.template data()] __device__(auto const& i) { printf("%d,", c[i]); }); - std::cout << "]" << std::endl; -} - -void print(std::string const& msg, - rmm::device_uvector const& scatter, - rmm::cuda_stream_view stream) -{ - std::cout << msg << " == ["; - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - scatter.size(), - [s = scatter.begin()] __device__(auto const& i) { - auto si = s[i]; - printf("%s[%d](%d), ", - (si.label() == unbound_list_view::label_type::SOURCE ? "S" : "T"), - si.row_index(), - si.size()); - }); - std::cout << "]" << std::endl; -} -#endif // NDEBUG - /** * @brief (type_dispatch endpoint) Functor that constructs the child column result * of `scatter()`ing a list column. 
@@ -343,15 +312,6 @@ struct list_child_constructor { list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) : std::make_pair(rmm::device_buffer{}, 0); -#ifndef NDEBUG - print("list_offsets ", list_offsets, stream); - print("source_lists.child() ", source_lists_column_view.child(), stream); - print("source_lists.offsets() ", source_lists_column_view.offsets(), stream); - print("target_lists.child() ", target_lists_column_view.child(), stream); - print("target_lists.offsets() ", target_lists_column_view.offsets(), stream); - print("scatter_rows ", list_vector, stream); -#endif // NDEBUG - auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(), num_child_rows, child_null_mask.first, @@ -359,47 +319,33 @@ struct list_child_constructor { stream, mr); - auto copy_child_values_for_list_index = [d_scattered_lists = - list_vector.begin(), // unbound_list_view* - d_child_column = - child_column->mutable_view().data(), - d_offsets = list_offsets.template data(), - source_lists, - target_lists] __device__(auto const& row_index) { - auto const unbound_list_row = d_scattered_lists[row_index]; - auto const actual_list_row = unbound_list_row.bind_to_column(source_lists, target_lists); - auto const& bound_column = - (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists - : target_lists); - auto const list_begin_offset = - bound_column.offsets().template element(unbound_list_row.row_index()); - auto const list_end_offset = - bound_column.offsets().template element(unbound_list_row.row_index() + 1); - -#ifndef NDEBUG - printf( - "%d: Unbound == %s[%d](%d), Bound size == %d, calc_begin==%d, calc_end=%d, calc_size=%d\n", - row_index, - (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? "S" : "T"), - unbound_list_row.row_index(), - unbound_list_row.size(), - actual_list_row.size(), - list_begin_offset, - list_end_offset, - list_end_offset - list_begin_offset); -#endif // NDEBUG - - // Copy all elements in this list row, to "appropriate" offset in child-column. - auto const destination_start_offset = d_offsets[row_index]; - thrust::for_each_n(thrust::seq, - thrust::make_counting_iterator(0), - actual_list_row.size(), - [actual_list_row, d_child_column, destination_start_offset] __device__( - auto const& list_element_index) { - d_child_column[destination_start_offset + list_element_index] = - actual_list_row.template element(list_element_index); - }); - }; + auto copy_child_values_for_list_index = + [d_scattered_lists = list_vector.begin(), // unbound_list_view* + d_child_column = child_column->mutable_view().data(), + d_offsets = list_offsets.template data(), + source_lists, + target_lists] __device__(auto const& row_index) { + auto const unbound_list_row = d_scattered_lists[row_index]; + auto const actual_list_row = unbound_list_row.bind_to_column(source_lists, target_lists); + auto const& bound_column = + (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists + : target_lists); + auto const list_begin_offset = + bound_column.offsets().template element(unbound_list_row.row_index()); + auto const list_end_offset = + bound_column.offsets().template element(unbound_list_row.row_index() + 1); + + // Copy all elements in this list row, to "appropriate" offset in child-column. 
+ auto const destination_start_offset = d_offsets[row_index]; + thrust::for_each_n(thrust::seq, + thrust::make_counting_iterator(0), + actual_list_row.size(), + [actual_list_row, d_child_column, destination_start_offset] __device__( + auto const& list_element_index) { + d_child_column[destination_start_offset + list_element_index] = + actual_list_row.template element(list_element_index); + }); + }; // For each list-row, copy underlying elements to the child column. thrust::for_each_n(rmm::exec_policy(stream), @@ -712,64 +658,46 @@ struct list_child_constructor { void assert_same_data_type(column_view const& lhs, column_view const& rhs) { CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Mismatched Data types."); - CUDF_EXPECTS(lhs.num_children() == rhs.num_children(), "Mismatched number of child columns."); + // Empty string column has no children + CUDF_EXPECTS(lhs.type().id() == type_id::STRING or lhs.num_children() == rhs.num_children(), + "Mismatched number of child columns."); for (int i{0}; i < lhs.num_children(); ++i) { assert_same_data_type(lhs.child(i), rhs.child(i)); } } -} // namespace - /** - * @brief Scatters lists into a copy of the target column - * according to a scatter map. + * @brief General implementation of scattering into list column * - * The scatter is performed according to the scatter iterator such that row - * `scatter_map[i]` of the output column is replaced by the source list-row. - * All other rows of the output column equal corresponding rows of the target table. + * Scattering `source` into `target` according to `scatter_map`. + * The view order of `source` and `target` can be specified by + * `source_vector` and `target_vector` respectively. * - * If the same index appears more than once in the scatter map, the result is - * undefined. - * - * The caller must update the null mask in the output column. - * - * @tparam SourceIterator must produce list_view objects * @tparam MapIterator must produce index values within the target column. * + * @param source_vector A vector of `unbound_list_view` into source column + * @param target_vector A vector of `unbound_list_view` into target column + * @param scatter_map_begin Start iterator of scatter map + * @param scatter_map_end End iterator of scatter map + * @param source Source column view + * @param target Target column view * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column. 
*/ template -std::unique_ptr scatter( - column_view const& source, +std::unique_ptr scatter_impl( + rmm::device_uvector const& source_vector, + rmm::device_uvector& target_vector, MapIterator scatter_map_begin, MapIterator scatter_map_end, + column_view const& source, column_view const& target, - rmm::cuda_stream_view stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const num_rows = target.size(); - - if (num_rows == 0) { return cudf::empty_like(target); } - - auto const child_column_type = lists_column_view(target).child().type(); - assert_same_data_type(source, target); - using lists_column_device_view = cudf::detail::lists_column_device_view; - using unbound_list_view = cudf::lists::detail::unbound_list_view; - - auto const source_device_view = column_device_view::create(source, stream); - auto const source_vector = list_vector_from_column(unbound_list_view::label_type::SOURCE, - lists_column_device_view(*source_device_view), - stream, - mr); - - auto const target_device_view = column_device_view::create(target, stream); - auto target_vector = list_vector_from_column(unbound_list_view::label_type::TARGET, - lists_column_device_view(*target_device_view), - stream, - mr); + auto const child_column_type = lists_column_view(target).child().type(); // Scatter. thrust::scatter(rmm::exec_policy(stream), @@ -800,7 +728,7 @@ std::unique_ptr scatter( auto null_mask = target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; - return cudf::make_lists_column(num_rows, + return cudf::make_lists_column(target.size(), std::move(offsets_column), std::move(child_column), cudf::UNKNOWN_NULL_COUNT, @@ -809,6 +737,143 @@ std::unique_ptr scatter( mr); } +} // namespace + +/** + * @brief Scatters lists into a copy of the target column + * according to a scatter map. + * + * The scatter is performed according to the scatter iterator such that row + * `scatter_map[i]` of the output column is replaced by the source list-row. + * All other rows of the output column equal corresponding rows of the target table. + * + * If the same index appears more than once in the scatter map, the result is + * undefined. + * + * The caller must update the null mask in the output column. + * + * @tparam MapIterator must produce index values within the target column. + * + * @param source Source column view + * @param scatter_map_begin Start iterator of scatter map + * @param scatter_map_end End iterator of scatter map + * @param target Target column view + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column. 
+ */ +template +std::unique_ptr scatter( + column_view const& source, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& target, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const num_rows = target.size(); + if (num_rows == 0) { return cudf::empty_like(target); } + + auto const source_device_view = column_device_view::create(source, stream); + auto const scatter_map_size = thrust::distance(scatter_map_begin, scatter_map_end); + auto const source_vector = + list_vector_from_column(unbound_list_view::label_type::SOURCE, + cudf::detail::lists_column_device_view(*source_device_view), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(scatter_map_size), + stream, + mr); + + auto const target_device_view = column_device_view::create(target, stream); + auto target_vector = + list_vector_from_column(unbound_list_view::label_type::TARGET, + cudf::detail::lists_column_device_view(*target_device_view), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + stream, + mr); + + return scatter_impl( + source_vector, target_vector, scatter_map_begin, scatter_map_end, source, target, stream, mr); +} + +/** + * @brief Scatters list scalar (a single row) into a copy of the target column + * according to a scatter map. + * + * Returns a copy of the target column where every row specified in the `scatter_map` + * is replaced by the row value. + * + * If the same index appears more than once in the scatter map, the result is + * undefined. + * + * The caller must update the null mask in the output column. + * + * @tparam MapIterator must produce index values within the target column. + * + * @param slr Source scalar, specifying row data + * @param scatter_map_begin Start iterator of scatter map + * @param scatter_map_end End iterator of scatter map + * @param target Target column view + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column. + */ +template +std::unique_ptr scatter( + scalar const& slr, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& target, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const num_rows = target.size(); + if (num_rows == 0) { return cudf::empty_like(target); } + + auto lv = static_cast(&slr); + bool slr_valid = slr.is_valid(stream); + rmm::device_buffer null_mask = + slr_valid ? cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) + : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); + auto offset_column = make_numeric_column( + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); + thrust::sequence(rmm::exec_policy(stream), + offset_column->mutable_view().begin(), + offset_column->mutable_view().end(), + 0, + lv->view().size()); + auto wrapped = column_view(data_type{type_id::LIST}, + 1, + nullptr, + static_cast(null_mask.data()), + slr_valid ? 
0 : 1, + 0, + {offset_column->view(), lv->view()}); + + auto const source_device_view = column_device_view::create(wrapped, stream); + auto const scatter_map_size = thrust::distance(scatter_map_begin, scatter_map_end); + auto const source_vector = + list_vector_from_column(unbound_list_view::label_type::SOURCE, + cudf::detail::lists_column_device_view(*source_device_view), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + scatter_map_size, + stream, + mr); + + auto const target_device_view = column_device_view::create(target, stream); + auto target_vector = + list_vector_from_column(unbound_list_view::label_type::TARGET, + cudf::detail::lists_column_device_view(*target_device_view), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + stream, + mr); + + return scatter_impl( + source_vector, target_vector, scatter_map_begin, scatter_map_end, wrapped, target, stream, mr); +} + } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 3fccc2122cf..994e085585c 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -142,10 +142,15 @@ struct column_scalar_scatterer_impl { template struct column_scalar_scatterer_impl { - template - std::unique_ptr operator()(Args&&...) const + std::unique_ptr operator()(std::reference_wrapper const& source, + MapIterator scatter_iter, + size_type scatter_rows, + column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - CUDF_FAIL("scatter scalar to list_view not implemented"); + return lists::detail::scatter( + source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); } }; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b0377d8d2dc..7900c5b3274 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -210,6 +210,7 @@ ConfigureTest(COPYING_TEST copying/sample_tests.cpp copying/scatter_tests.cpp copying/scatter_list_tests.cpp + copying/scatter_list_scalar_tests.cpp copying/scatter_struct_tests.cpp copying/segmented_gather_list_tests.cpp copying/shift_tests.cpp diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp new file mode 100644 index 00000000000..d60fd82af8c --- /dev/null +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf { +namespace test { + +using mask_vector = std::vector; +using size_column = fixed_width_column_wrapper; + +class ScatterListScalarTests : public cudf::test::BaseFixture { +}; + +std::unique_ptr single_scalar_scatter(column_view const& target, + scalar const& slr, + column_view const& scatter_map) +{ + std::vector> slrs{slr}; + table_view targets{{target}}; + auto result = scatter(slrs, scatter_map, targets, true); + return std::move(result->release()[0]); +} + +template +class ScatterListOfFixedWidthScalarTest : public ScatterListScalarTests { +}; + +TYPED_TEST_CASE(ScatterListOfFixedWidthScalarTest, FixedWidthTypesWithoutFixedPoint); + +// Test grid +// Dim1 : {Fixed width, strings, lists, structs} +// Dim2 : {Null scalar, Non-null empty scalar, Non-null non-empty scalar} +// Dim3 : {Nullable target, non-nullable target row} + +TYPED_TEST(ScatterListOfFixedWidthScalarTest, Basic) +{ + using LCW = lists_column_wrapper; + using FCW = fixed_width_column_wrapper; + + auto slr = std::make_unique(FCW({2, 2, 2}, {1, 0, 1}), true); + LCW col{LCW{1, 1, 1}, LCW{8, 8}, LCW{10, 10, 10, 10}, LCW{5}}; + size_column scatter_map{3, 1, 0}; + + LCW expected{LCW({2, 2, 2}, mask_vector{1, 0, 1}.begin()), + LCW({2, 2, 2}, mask_vector{1, 0, 1}.begin()), + LCW{10, 10, 10, 10}, + LCW({2, 2, 2}, mask_vector{1, 0, 1}.begin())}; + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TYPED_TEST(ScatterListOfFixedWidthScalarTest, EmptyValidScalar) +{ + using LCW = lists_column_wrapper; + using FCW = fixed_width_column_wrapper; + + auto slr = std::make_unique(FCW{}, true); + LCW col{LCW{1, 1, 1}, + LCW{8, 8}, + LCW({10, 10, 10, 10}, mask_vector{1, 0, 1, 0}.begin()), + LCW{5}, + LCW{42, 42}}; + size_column scatter_map{1, 0}; + + LCW expected{ + LCW{}, LCW{}, LCW({10, 10, 10, 10}, mask_vector{1, 0, 1, 0}.begin()), LCW{5}, LCW{42, 42}}; + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TYPED_TEST(ScatterListOfFixedWidthScalarTest, NullScalar) +{ + using LCW = lists_column_wrapper; + using FCW = fixed_width_column_wrapper; + + auto slr = std::make_unique(FCW{}, false); + LCW col{LCW({1, 1, 1}, mask_vector{0, 0, 1}.begin()), LCW{8, 8}, LCW{10, 10, 10, 10}, LCW{5}}; + size_column scatter_map{3, 1}; + + LCW expected({LCW({1, 1, 1}, mask_vector{0, 0, 1}.begin()), LCW{}, LCW{10, 10, 10, 10}, LCW{}}, + mask_vector{1, 0, 1, 0}.begin()); + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TYPED_TEST(ScatterListOfFixedWidthScalarTest, NullableTargetRow) +{ + using LCW = lists_column_wrapper; + using FCW = fixed_width_column_wrapper; + + auto slr = std::make_unique(FCW{9, 9}, true); + LCW col({LCW{4, 4}, LCW{}, LCW{8, 8, 8}, LCW{}, LCW{9, 9, 9}}, + mask_vector{1, 0, 1, 0, 1}.begin()); + size_column scatter_map{0, 1}; + + LCW expected({LCW{9, 9}, LCW{9, 9}, LCW{8, 8, 8}, LCW{}, LCW{9, 9, 9}}, + mask_vector{1, 1, 1, 0, 1}.begin()); + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +class ScatterListOfStringScalarTest : public ScatterListScalarTests { +}; + +TEST_F(ScatterListOfStringScalarTest, Basic) +{ + using LCW = lists_column_wrapper; + using StringCW = strings_column_wrapper; + + auto slr = std::make_unique( + StringCW({"Hello!", "", 
"你好!", "صباح الخير!", "", "こんにちは!"}, + {true, false, true, true, false, true}), + true); + LCW col{LCW({"xx", "yy"}, mask_vector{0, 1}.begin()), LCW{""}, LCW{"a", "bab", "bacab"}}; + + size_column scatter_map{2, 1}; + + LCW expected{LCW({"xx", "yy"}, mask_vector{0, 1}.begin()), + LCW({"Hello!", "", "你好!", "صباح الخير!", "", "こんにちは!"}, + mask_vector{1, 0, 1, 1, 0, 1}.begin()), + LCW({"Hello!", "", "你好!", "صباح الخير!", "", "こんにちは!"}, + mask_vector{1, 0, 1, 1, 0, 1}.begin())}; + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TEST_F(ScatterListOfStringScalarTest, EmptyValidScalar) +{ + using LCW = lists_column_wrapper; + using StringCW = strings_column_wrapper; + + auto slr = std::make_unique(StringCW{}, true); + + LCW col{LCW({"xx", "yy"}, mask_vector{0, 1}.begin()), + LCW{""}, + LCW{"a", "bab", "bacab"}, + LCW{"888", "777"}}; + + size_column scatter_map{0, 3}; + + LCW expected{LCW{}, LCW{""}, LCW{"a", "bab", "bacab"}, LCW{}}; + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TEST_F(ScatterListOfStringScalarTest, NullScalar) +{ + using LCW = lists_column_wrapper; + using StringCW = strings_column_wrapper; + + auto slr = std::make_unique(StringCW{}, false); + LCW col{LCW{"xx", "yy"}, + LCW({""}, mask_vector{0}.begin()), + LCW{"a", "bab", "bacab"}, + LCW{"888", "777"}}; + + size_column scatter_map{1, 2}; + + LCW expected({LCW{"xx", "yy"}, LCW{}, LCW{}, LCW{"888", "777"}}, mask_vector{1, 0, 0, 1}.begin()); + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TEST_F(ScatterListOfStringScalarTest, NullableTargetRow) +{ + using LCW = lists_column_wrapper; + using StringCW = strings_column_wrapper; + + auto slr = std::make_unique( + StringCW({"Hello!", "", "こんにちは!"}, {true, false, true}), true); + LCW col({LCW{"xx", "yy"}, LCW({""}, mask_vector{0}.begin()), LCW{}, LCW{"888", "777"}}, + mask_vector{1, 1, 0, 1}.begin()); + + size_column scatter_map{3, 2}; + + LCW expected({LCW{"xx", "yy"}, + LCW({""}, mask_vector{0}.begin()), + LCW({"Hello!", "", "こんにちは!"}, mask_vector{1, 0, 1}.begin()), + LCW({"Hello!", "", "こんにちは!"}, mask_vector{1, 0, 1}.begin())}, + mask_vector{1, 1, 1, 1}.begin()); + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +template +class ScatterListOfListScalarTest : public ScatterListScalarTests { +}; + +TYPED_TEST_CASE(ScatterListOfListScalarTest, FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(ScatterListOfListScalarTest, Basic) +{ + using LCW = lists_column_wrapper; + + auto slr = std::make_unique( + LCW({LCW{1, 2, 3}, LCW{4}, LCW{}, LCW{5, 6}}, mask_vector{1, 1, 0, 1}.begin()), true); + LCW col({LCW({LCW{88, 88}, LCW{}, LCW{9, 9, 9}}, mask_vector{1, 0, 1}.begin()), + LCW{LCW{66}, LCW{}, LCW({77, 77, 77, 77}, mask_vector{1, 0, 0, 1}.begin())}, + LCW{LCW{55, 55}, LCW{}, LCW{10, 10, 10}}, + LCW{LCW{44, 44}}}); + + size_column scatter_map{1, 2, 3}; + + LCW expected({LCW({LCW{88, 88}, LCW{}, LCW{9, 9, 9}}, mask_vector{1, 0, 1}.begin()), + LCW({LCW{1, 2, 3}, LCW{4}, LCW{}, LCW{5, 6}}, mask_vector{1, 1, 0, 1}.begin()), + LCW({LCW{1, 2, 3}, LCW{4}, LCW{}, LCW{5, 6}}, mask_vector{1, 1, 0, 1}.begin()), + LCW({LCW{1, 2, 3}, LCW{4}, LCW{}, LCW{5, 6}}, mask_vector{1, 1, 0, 1}.begin())}); + + auto result = single_scalar_scatter(col, *slr, scatter_map); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TYPED_TEST(ScatterListOfListScalarTest, EmptyValidScalar) +{ + using LCW = lists_column_wrapper; + + auto slr = std::make_unique(LCW{}, true); + LCW col({LCW({LCW{88, 88}, LCW{}, LCW{9, 9, 9}}, mask_vector{1, 0, 1}.begin()), + LCW{LCW{66}, LCW{}, LCW({77, 77, 77, 77}, mask_vector{1, 0, 0, 1}.begin())}, + LCW{LCW{55, 55}, LCW{}, LCW{10, 10, 10}}, + LCW{LCW{44, 44}}}); + + size_column scatter_map{3, 0}; + + LCW expected({LCW{}, + LCW{LCW{66}, LCW{}, LCW({77, 77, 77, 77}, mask_vector{1, 0, 0, 1}.begin())}, + LCW{LCW{55, 55}, LCW{}, LCW{10, 10, 10}}, + LCW{}}); + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TYPED_TEST(ScatterListOfListScalarTest, NullScalar) +{ + using LCW = lists_column_wrapper; + + auto slr = std::make_unique(LCW{}, false); + LCW col({LCW({LCW{88, 88}, LCW{}, LCW{9, 9, 9}}, mask_vector{1, 0, 1}.begin()), + LCW{LCW{66}, LCW{}, LCW({77, 77, 77, 77}, mask_vector{1, 0, 0, 1}.begin())}, + LCW{LCW{44, 44}}}); + + size_column scatter_map{1, 0}; + + LCW expected({LCW{}, LCW{}, LCW{LCW{44, 44}}}, mask_vector{0, 0, 1}.begin()); + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TYPED_TEST(ScatterListOfListScalarTest, NullableTargetRows) +{ + using LCW = lists_column_wrapper; + + auto slr = std::make_unique( + LCW({LCW{1, 1, 1}, LCW{3, 3}, LCW{}, LCW{4}}, mask_vector{1, 1, 0, 1}.begin()), true); + + LCW col({LCW({LCW{88, 88}, LCW{}, LCW{9, 9, 9}}, mask_vector{1, 0, 1}.begin()), + LCW{LCW{66}, LCW{}, LCW({77, 77, 77, 77}, mask_vector{1, 0, 0, 1}.begin())}, + LCW{LCW{44, 44}}}, + mask_vector{1, 0, 1}.begin()); + + size_column scatter_map{1}; + + LCW expected({LCW({LCW{88, 88}, LCW{}, LCW{9, 9, 9}}, mask_vector{1, 0, 1}.begin()), + LCW({LCW{1, 1, 1}, LCW{3, 3}, LCW{}, LCW{4}}, mask_vector{1, 1, 0, 1}.begin()), + LCW{LCW{44, 44}}}, + mask_vector{1, 1, 1}.begin()); + + auto result = single_scalar_scatter(col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +template +class ScatterListOfStructScalarTest : public ScatterListScalarTests { + protected: + structs_column_wrapper make_test_structs(fixed_width_column_wrapper field0, + strings_column_wrapper field1, + lists_column_wrapper field2, + std::vector mask) + { + return structs_column_wrapper({field0, field1, field2}, mask.begin()); + } +}; + +TYPED_TEST_CASE(ScatterListOfStructScalarTest, FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(ScatterListOfStructScalarTest, Basic) +{ + using LCW = lists_column_wrapper; + using offset_t = fixed_width_column_wrapper; + + auto data = + this->make_test_structs({{42, 42, 42}, {1, 0, 1}}, + {{"hello", "你好!", "bonjour!"}, {false, true, true}}, + LCW({LCW{88}, LCW{}, LCW{99, 99}}, mask_vector{1, 0, 1}.begin()), + {1, 1, 0}); + auto slr = std::make_unique(data, true); + + auto child = this->make_test_structs( + {{1, 1, 2, 3, 3, 3}, {0, 1, 1, 1, 0, 0}}, + {{"x", "x", "yy", "", "zzz", "zzz"}, {true, true, true, false, true, true}}, + LCW({LCW{10, 10}, LCW{}, LCW{10}, LCW{20, 20}, LCW{}, LCW{30, 30}}, + mask_vector{1, 0, 1, 1, 0, 1}.begin()), + {1, 1, 0, 0, 1, 1}); + offset_t offsets{0, 2, 2, 3, 6}; + auto col = make_lists_column(4, offsets.release(), child.release(), 0, rmm::device_buffer{}); + + size_column scatter_map{1, 3}; + + auto ex_child = this->make_test_structs( + {{1, 1, 42, 42, 42, 2, 42, 42, 42}, {0, 1, 1, 0, 1, 1, 1, 0, 1}}, + {{"x", "x", 
"hello", "你好!", "bonjour!", "yy", "hello", "你好!", "bonjour!"}, + {true, true, false, true, true, true, false, true, true}}, + LCW({LCW{10, 10}, LCW{}, LCW{88}, LCW{}, LCW{99, 99}, LCW{10}, LCW{88}, LCW{}, LCW{99, 99}}, + mask_vector{1, 0, 1, 0, 1, 1, 1, 0, 1}.begin()), + {1, 1, 1, 1, 0, 0, 1, 1, 0}); + offset_t ex_offsets{0, 2, 5, 6, 9}; + auto expected = + make_lists_column(4, ex_offsets.release(), ex_child.release(), 0, rmm::device_buffer{}); + + auto result = single_scalar_scatter(*col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +TYPED_TEST(ScatterListOfStructScalarTest, EmptyValidScalar) +{ + using LCW = lists_column_wrapper; + using offset_t = fixed_width_column_wrapper; + + auto data = this->make_test_structs({}, {}, LCW{}, {}); + auto slr = std::make_unique(data, true); + + auto child = this->make_test_structs( + {{1, 1, 2, 3, 3, 3}, {0, 1, 1, 1, 0, 0}}, + {{"x", "x", "yy", "", "zzz", "zzz"}, {true, true, true, false, true, true}}, + LCW({LCW{10, 10}, LCW{}, LCW{10}, LCW{20, 20}, LCW{}, LCW{30, 30}}, + mask_vector{1, 0, 1, 1, 0, 1}.begin()), + {1, 1, 0, 0, 1, 1}); + offset_t offsets{0, 2, 2, 3, 6}; + auto col = make_lists_column(4, offsets.release(), child.release(), 0, rmm::device_buffer{}); + + size_column scatter_map{0, 2}; + + auto ex_child = + this->make_test_structs({{3, 3, 3}, {1, 0, 0}}, + {{"", "zzz", "zzz"}, {false, true, true}}, + LCW({LCW{20, 20}, LCW{}, LCW{30, 30}}, mask_vector{1, 0, 1}.begin()), + {0, 1, 1}); + offset_t ex_offsets{0, 0, 0, 0, 3}; + auto expected = + make_lists_column(4, ex_offsets.release(), ex_child.release(), 0, rmm::device_buffer{}); + + auto result = single_scalar_scatter(*col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +TYPED_TEST(ScatterListOfStructScalarTest, NullScalar) +{ + using LCW = lists_column_wrapper; + using offset_t = fixed_width_column_wrapper; + + auto data = this->make_test_structs({}, {}, {}, {}); + auto slr = std::make_unique(data, false); + + auto child = this->make_test_structs( + {{1, 1, 2, 3, 3, 3}, {0, 1, 1, 1, 0, 0}}, + {{"x", "x", "yy", "", "zzz", "zzz"}, {true, true, true, false, true, true}}, + LCW({LCW{10, 10}, LCW{}, LCW{10}, LCW{20, 20}, LCW{}, LCW{30, 30}}, + mask_vector{1, 0, 1, 1, 0, 1}.begin()), + {1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 2, 2, 3, 6}; + auto col = make_lists_column(4, offsets.release(), child.release(), 0, rmm::device_buffer{}); + + size_column scatter_map{3, 1, 0}; + + auto ex_child = this->make_test_structs({2}, {"yy"}, LCW({10}, mask_vector{1}.begin()), {1}); + offset_t ex_offsets{0, 0, 0, 1, 1}; + + auto null_mask = create_null_mask(4, mask_state::ALL_NULL); + set_null_mask(static_cast(null_mask.data()), 2, 3, true); + auto expected = + make_lists_column(4, ex_offsets.release(), ex_child.release(), 3, std::move(null_mask)); + + auto result = single_scalar_scatter(*col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +TYPED_TEST(ScatterListOfStructScalarTest, NullableTargetRow) +{ + using LCW = lists_column_wrapper; + using offset_t = fixed_width_column_wrapper; + + auto data = + this->make_test_structs({{42, 42, 42}, {1, 0, 1}}, + {{"hello", "你好!", "bonjour!"}, {false, true, true}}, + LCW({LCW{88}, LCW{}, LCW{99, 99}}, mask_vector{1, 0, 1}.begin()), + {1, 1, 0}); + auto slr = std::make_unique(data, true); + + auto child = this->make_test_structs( + {{1, 1, 2, 3, 3, 3}, {0, 1, 1, 1, 0, 0}}, + {{"x", "x", "yy", "", "zzz", "zzz"}, {true, true, true, false, true, true}}, + 
LCW({LCW{10, 10}, LCW{}, LCW{10}, LCW{20, 20}, LCW{}, LCW{30, 30}}, + mask_vector{1, 0, 1, 1, 0, 1}.begin()), + {1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 2, 2, 3, 6}; + auto null_mask = create_null_mask(4, mask_state::ALL_VALID); + set_null_mask(static_cast(null_mask.data()), 1, 3, false); + auto col = make_lists_column(4, offsets.release(), child.release(), 2, std::move(null_mask)); + + size_column scatter_map{3, 2}; + + auto ex_child = this->make_test_structs( + {{1, 1, 42, 42, 42, 42, 42, 42}, {0, 1, 1, 0, 1, 1, 0, 1}}, + {{"x", "x", "hello", "你好!", "bonjour!", "hello", "你好!", "bonjour!"}, + {true, true, false, true, true, false, true, true}}, + LCW({LCW{10, 10}, LCW{}, LCW{88}, LCW{}, LCW{99, 99}, LCW{88}, LCW{}, LCW{99, 99}}, + mask_vector{1, 0, 1, 0, 1, 1, 0, 1}.begin()), + {1, 1, 1, 1, 0, 1, 1, 0}); + offset_t ex_offsets{0, 2, 2, 5, 8}; + + auto ex_null_mask = create_null_mask(4, mask_state::ALL_VALID); + set_null_mask(static_cast(ex_null_mask.data()), 1, 2, false); + auto expected = + make_lists_column(4, ex_offsets.release(), ex_child.release(), 1, std::move(ex_null_mask)); + + auto result = single_scalar_scatter(*col, *slr, scatter_map); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +} // namespace test +} // namespace cudf From 6dbf2d58d1947c018480372e966dd5fc5f5e5dc7 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 24 May 2021 14:16:36 -0700 Subject: [PATCH 23/24] Add `groupby::replace_nulls(replace_policy)` api (#7118) Part 1 of #4896, follow up of #6907 This PR provides a groupby version of the `replace_nulls(replace_policy)` function. A regular `replace_nulls(replace_policy)` operation updates the nulls with the first non-null value that precedes/follows the null. The groupby version is similar, with an exception that the non-null value look-up is bounded by groups. 
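For orientation, here is a minimal sketch of how the new C++ API can be invoked. This is illustrative only: the helper function and the `keys`/`vals` column views are assumptions for the example, not part of this change.

```cpp
// Illustrative sketch: grouped forward-fill of nulls via the new groupby::replace_nulls API.
// `keys` and `vals` are assumed to be existing cudf::column_view objects.
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/replace.hpp>
#include <cudf/table/table_view.hpp>

#include <vector>

void grouped_ffill_example(cudf::column_view const& keys, cudf::column_view const& vals)
{
  cudf::groupby::groupby gb(cudf::table_view({keys}));
  std::vector<cudf::replace_policy> policies{cudf::replace_policy::PRECEDING};  // forward fill
  auto result = gb.replace_nulls(cudf::table_view({vals}), policies);
  // result.first  : the sorted keys (rows of one group are contiguous)
  // result.second : the values, with each null replaced by the preceding non-null value
  //                 found in the same group (left as null if no such value exists)
}
```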
Here is an example to illustrate the API input/output behavior: ```python #Input: keys = [2, 1, 2, 1] values = [3, 4, NULL, NULL] #Output, group order is not guaranteed: sorted_keys = [1, 1, 2, 2] result = [4, 4, 3, 3] ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/nvdbaranec - https://github.com/brandon-b-miller - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7118 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + .../detail/groupby/group_replace_nulls.hpp | 47 +++ cpp/include/cudf/detail/replace/nulls.cuh | 44 +++ cpp/include/cudf/groupby.hpp | 43 ++ cpp/src/groupby/groupby.cu | 30 ++ cpp/src/groupby/sort/group_replace_nulls.cu | 82 ++++ cpp/src/replace/nulls.cu | 23 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/replace_nulls_tests.cpp | 366 ++++++++++++++++++ python/cudf/cudf/_lib/cpp/groupby.pxd | 9 +- python/cudf/cudf/_lib/groupby.pyx | 31 ++ 12 files changed, 660 insertions(+), 18 deletions(-) create mode 100644 cpp/include/cudf/detail/groupby/group_replace_nulls.hpp create mode 100644 cpp/include/cudf/detail/replace/nulls.cuh create mode 100644 cpp/src/groupby/sort/group_replace_nulls.cu create mode 100644 cpp/tests/groupby/replace_nulls_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index a8abe5b09f0..3fa4cbdff51 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -74,6 +74,7 @@ test: - test -f $PREFIX/include/cudf/detail/gather.hpp - test -f $PREFIX/include/cudf/detail/groupby.hpp - test -f $PREFIX/include/cudf/detail/groupby/sort_helper.hpp + - test -f $PREFIX/include/cudf/detail/groupby/group_replace_nulls.hpp - test -f $PREFIX/include/cudf/detail/hashing.hpp - test -f $PREFIX/include/cudf/detail/interop.hpp - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index aa3b4406320..3501bb9345c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -209,6 +209,7 @@ add_library(cudf src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu src/groupby/sort/group_sum_scan.cu + src/groupby/sort/group_replace_nulls.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp new file mode 100644 index 00000000000..5fb7379734f --- /dev/null +++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include +namespace cudf { +namespace groupby { +namespace detail { + +/** + * @brief Internal API to replace nulls with preceding/following non-null values in @p value + * + * @param[in] grouped_value A column whose null values will be replaced. + * @param[in] group_labels Group labels for @p grouped_value, corresponding to group keys. + * @param[in] replace_policy Specify the position of replacement values relative to null values. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate device memory of the returned column. + */ +std::unique_ptr group_replace_nulls( + cudf::column_view const& grouped_value, + device_span group_labels, + cudf::replace_policy replace_policy, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/include/cudf/detail/replace/nulls.cuh b/cpp/include/cudf/detail/replace/nulls.cuh new file mode 100644 index 00000000000..1500bdfb0b8 --- /dev/null +++ b/cpp/include/cudf/detail/replace/nulls.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +using idx_valid_pair_t = thrust::tuple; + +/** + * @brief Functor used by `replace_nulls(replace_policy)` to determine the index to gather from in + * the result column. + * + * Binary functor passed to `inclusive_scan` or `inclusive_scan_by_key`. Arguments are a tuple of + * index and validity of a row. Returns a tuple of current index and a discarded boolean if current + * row is valid, otherwise a tuple of the nearest non-null row index and a discarded boolean. + */ +struct replace_policy_functor { + __device__ idx_valid_pair_t operator()(idx_valid_pair_t const& lhs, idx_valid_pair_t const& rhs) + { + return thrust::get<1>(rhs) ? thrust::make_tuple(thrust::get<0>(rhs), true) + : thrust::make_tuple(thrust::get<0>(lhs), true); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 800d7c6f1f6..0a08c978715 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -17,10 +17,13 @@ #pragma once #include +#include +#include #include #include #include +#include #include #include @@ -287,6 +290,46 @@ class groupby { groups get_groups(cudf::table_view values = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @brief Performs grouped replace nulls on @p value + * + * For each `value[i] == NULL` in group `j`, `value[i]` is replaced with the first non-null value + * in group `j` that precedes or follows `value[i]`. If a non-null value is not found in the + * specified direction, `value[i]` is left NULL. 
+ *
+ * The returned pair contains a column of the sorted keys and the result column. In the result
+ * column, values of the same group are stored contiguously, and within each group the values
+ * keep their original order. The order of the groups themselves is not guaranteed.
+ *
+ * Example:
+ * @code{.pseudo}
+ *
+ * //Inputs:
+ * keys: {3 3 1 3 1 3 4}
+ *       {2 2 1 2 1 2 5}
+ * values: {3 4 7 @ @ @ @}
+ *         {@ @ @ "x" "tt" @ @}
+ * replace_policies: {FORWARD, BACKWARD}
+ *
+ * //Outputs (group orders may be different):
+ * keys: {3 3 3 3 1 1 4}
+ *       {2 2 2 2 1 1 5}
+ * result: {3 4 4 4 7 7 @}
+ *         {"x" "x" "x" @ "tt" "tt" @}
+ * @endcode
+ *
+ * @param[in] values A table whose column null values will be replaced.
+ * @param[in] replace_policies Specify the position of replacement values relative to null values,
+ * one for each column.
+ * @param[in] mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @return Pair that contains a table with the sorted keys and the result column
+ */
+ std::pair<std::unique_ptr<table>, std::unique_ptr<table>
> replace_nulls( + table_view const& values, + host_span replace_policies, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + private: table_view _keys; ///< Keys that determine grouping null_policy _include_null_keys{null_policy::EXCLUDE}; ///< Include rows in keys diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index f132d6b1511..72fbabe100c 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -223,6 +224,35 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re } } +std::pair, std::unique_ptr
> groupby::replace_nulls( + table_view const& values, + host_span replace_policies, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(_keys.num_rows() == values.num_rows(), + "Size mismatch between group labels and value."); + CUDF_EXPECTS(static_cast(replace_policies.size()) == values.num_columns(), + "Size mismatch between num_columns and replace_policies."); + + if (values.is_empty()) { return std::make_pair(empty_like(_keys), empty_like(values)); } + auto const stream = rmm::cuda_stream_default; + + auto const& group_labels = helper().group_labels(stream); + std::vector> results; + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.num_columns()), + std::back_inserter(results), + [&](auto i) { + auto grouped_values = helper().grouped_values(values.column(i), stream); + return detail::group_replace_nulls( + grouped_values->view(), group_labels, replace_policies[i], stream, mr); + }); + + return std::make_pair(std::move(helper().sorted_keys(stream, mr)), + std::make_unique
(std::move(results))); +} + // Get the sort helper object detail::sort::sort_groupby_helper& groupby::helper() { diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu new file mode 100644 index 00000000000..56e4cb83f71 --- /dev/null +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { + +std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_value, + device_span group_labels, + cudf::replace_policy replace_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + cudf::size_type size = grouped_value.size(); + + auto device_in = cudf::column_device_view::create(grouped_value); + auto index = thrust::make_counting_iterator(0); + auto valid_it = cudf::detail::make_validity_iterator(*device_in); + auto in_begin = thrust::make_zip_iterator(thrust::make_tuple(index, valid_it)); + + rmm::device_uvector gather_map(size, stream); + auto gm_begin = thrust::make_zip_iterator( + thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator())); + + auto func = cudf::detail::replace_policy_functor(); + thrust::equal_to eq; + if (replace_policy == cudf::replace_policy::PRECEDING) { + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.begin() + size, + in_begin, + gm_begin, + eq, + func); + } else { + auto gl_rbegin = thrust::make_reverse_iterator(group_labels.begin() + size); + auto in_rbegin = thrust::make_reverse_iterator(in_begin + size); + auto gm_rbegin = thrust::make_reverse_iterator(gm_begin + size); + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), gl_rbegin, gl_rbegin + size, in_rbegin, gm_rbegin, eq, func); + } + + auto output = cudf::detail::gather(cudf::table_view({grouped_value}), + gather_map.begin(), + gather_map.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + return std::move(output->release()[0]); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 1e6c779c51a..f55696317f4 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -40,10 +41,13 @@ #include #include +#include +#include #include #include #include +#include #include namespace { // anonymous @@ -356,22 +360,6 @@ std::unique_ptr replace_nulls_scalar_kernel_forwarder::operator()< return cudf::dictionary::detail::replace_nulls(dict_input, replacement, stream, mr); } -/** - * @brief Functor used by `inclusive_scan` to determine the index to gather from in - * the result column. 
When current row in input column is NULL, return previous - * accumulated index, otherwise return the current index. The second element in - * the return tuple is discarded. - */ -struct replace_policy_functor { - __device__ thrust::tuple operator()( - thrust::tuple const& lhs, - thrust::tuple const& rhs) - { - return thrust::get<1>(rhs) ? thrust::make_tuple(thrust::get<0>(rhs), true) - : thrust::make_tuple(thrust::get<0>(lhs), true); - } -}; - /** * @brief Function used by replace_nulls policy */ @@ -390,7 +378,7 @@ std::unique_ptr replace_nulls_policy_impl(cudf::column_view const& auto gm_begin = thrust::make_zip_iterator( thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator())); - auto func = replace_policy_functor(); + auto func = cudf::detail::replace_policy_functor(); if (replace_policy == cudf::replace_policy::PRECEDING) { thrust::inclusive_scan( rmm::exec_policy(stream), in_begin, in_begin + input.size(), gm_begin, func); @@ -414,6 +402,7 @@ std::unique_ptr replace_nulls_policy_impl(cudf::column_view const& namespace cudf { namespace detail { + std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::column_view const& replacement, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7900c5b3274..a3df5989c3b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -72,6 +72,7 @@ ConfigureTest(GROUPBY_TEST groupby/nunique_tests.cpp groupby/product_tests.cpp groupby/quantile_tests.cpp + groupby/replace_nulls_tests.cpp groupby/shift_tests.cpp groupby/std_tests.cpp groupby/sum_of_squares_tests.cpp diff --git a/cpp/tests/groupby/replace_nulls_tests.cpp b/cpp/tests/groupby/replace_nulls_tests.cpp new file mode 100644 index 00000000000..527c7dba725 --- /dev/null +++ b/cpp/tests/groupby/replace_nulls_tests.cpp @@ -0,0 +1,366 @@ + +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +namespace cudf { +namespace test { + +using K = int32_t; + +template +struct GroupbyReplaceNullsFixedWidthTest : public BaseFixture { +}; + +TYPED_TEST_CASE(GroupbyReplaceNullsFixedWidthTest, FixedWidthTypes); + +template +void TestReplaceNullsGroupbySingle( + K const& key, V const& input, K const& expected_key, V const& expected_val, replace_policy policy) +{ + groupby::groupby gb_obj(table_view({key})); + std::vector policies{policy}; + auto p = gb_obj.replace_nulls(table_view({input}), policies); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*p.first, table_view({expected_key})); + CUDF_TEST_EXPECT_TABLES_EQUAL(*p.second, table_view({expected_val})); +} + +TYPED_TEST(GroupbyReplaceNullsFixedWidthTest, PrecedingFill) +{ + // Group 0 value: {42, 24, null} --> {42, 24, 24} + // Group 1 value: {7, null, null} --> {7, 7, 7} + fixed_width_column_wrapper key{0, 1, 0, 1, 0, 1}; + fixed_width_column_wrapper val({42, 7, 24, 10, 1, 1000}, {1, 1, 1, 0, 0, 0}); + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1}; + fixed_width_column_wrapper expect_val({42, 24, 24, 7, 7, 7}, all_valid()); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TYPED_TEST(GroupbyReplaceNullsFixedWidthTest, FollowingFill) +{ + // Group 0 value: {2, null, 32} --> {2, 32, 32} + // Group 1 value: {8, null, null, 128, 256} --> {8, 128, 128, 128, 256} + fixed_width_column_wrapper key{0, 0, 1, 1, 0, 1, 1, 1}; + fixed_width_column_wrapper val({2, 4, 8, 16, 32, 64, 128, 256}, + {1, 0, 1, 0, 1, 0, 1, 1}); + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1, 1, 1}; + fixed_width_column_wrapper expect_val({2, 32, 32, 8, 128, 128, 128, 256}, all_valid()); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +TYPED_TEST(GroupbyReplaceNullsFixedWidthTest, PrecedingFillLeadingNulls) +{ + // Group 0 value: {null, 24, null} --> {null, 24, 24} + // Group 1 value: {null, null, null} --> {null, null, null} + fixed_width_column_wrapper key{0, 1, 0, 1, 0, 1}; + fixed_width_column_wrapper val({42, 7, 24, 10, 1, 1000}, {0, 0, 1, 0, 0, 0}); + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1}; + fixed_width_column_wrapper expect_val({-1, 24, 24, -1, -1, -1}, {0, 1, 1, 0, 0, 0}); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TYPED_TEST(GroupbyReplaceNullsFixedWidthTest, FollowingFillTrailingNulls) +{ + // Group 0 value: {2, null, null} --> {2, null, null} + // Group 1 value: {null, null, 64, null, null} --> {64, 64, 64, null, null} + fixed_width_column_wrapper key{0, 0, 1, 1, 0, 1, 1, 1}; + fixed_width_column_wrapper val({2, 4, 8, 16, 32, 64, 128, 256}, + {1, 0, 0, 0, 0, 1, 0, 0}); + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1, 1, 1}; + fixed_width_column_wrapper expect_val({2, -1, -1, 64, 64, 64, -1, -1}, + {1, 0, 0, 1, 1, 1, 0, 0}); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +struct GroupbyReplaceNullsStringsTest : public BaseFixture { +}; + +TEST_F(GroupbyReplaceNullsStringsTest, PrecedingFill) +{ + // Group 0 value: {"y" "42"} --> {"y", "42"} + // Group 1 value: {"xx" @ "zzz" @ "one"} --> {"xx" "xx" "zzz" "zzz" "one"} + fixed_width_column_wrapper key{1, 1, 0, 1, 0, 1, 1}; + strings_column_wrapper val({"xx", "", "y", "zzz", "42", "", "one"}, + {true, false, true, true, true, false, true}); + + fixed_width_column_wrapper 
expect_key{0, 0, 1, 1, 1, 1, 1}; + strings_column_wrapper expect_val({"y", "42", "xx", "xx", "zzz", "zzz", "one"}, all_valid()); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TEST_F(GroupbyReplaceNullsStringsTest, FollowingFill) +{ + // Group 0 value: {@ "42"} --> {"42", "42"} + // Group 1 value: {"xx" @ "zzz" @ "one"} --> {"xx" "zzz" "zzz" "one" "one"} + fixed_width_column_wrapper key{1, 1, 0, 1, 0, 1, 1}; + strings_column_wrapper val({"xx", "", "", "zzz", "42", "", "one"}, + {true, false, false, true, true, false, true}); + + fixed_width_column_wrapper expect_key{0, 0, 1, 1, 1, 1, 1}; + strings_column_wrapper expect_val({"42", "42", "xx", "zzz", "zzz", "one", "one"}, all_valid()); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +TEST_F(GroupbyReplaceNullsStringsTest, PrecedingFillPrecedingNull) +{ + // Group 0 value: {"y" "42"} --> {"y", "42"} + // Group 1 value: {@ @ "zzz" "zzz" "zzz"} --> {@ @ "zzz" "zzz" "zzz"} + fixed_width_column_wrapper key{1, 1, 0, 1, 0, 1, 1}; + strings_column_wrapper val({"", "", "y", "zzz", "42", "", ""}, + {false, false, true, true, true, false, false}); + + fixed_width_column_wrapper expect_key{0, 0, 1, 1, 1, 1, 1}; + strings_column_wrapper expect_val({"y", "42", "", "", "zzz", "zzz", "zzz"}, + {true, true, false, false, true, true, true}); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TEST_F(GroupbyReplaceNullsStringsTest, FollowingFillTrailingNull) +{ + // Group 0 value: {@ "y"} --> {"y", "y"} + // Group 1 value: {"xx" @ "zzz" @ @} --> {"xx" "zzz" "zzz" @ @} + fixed_width_column_wrapper key{1, 1, 0, 1, 0, 1, 1}; + strings_column_wrapper val({"xx", "", "", "zzz", "y", "", ""}, + {true, false, false, true, true, false, false}); + + fixed_width_column_wrapper expect_key{0, 0, 1, 1, 1, 1, 1}; + strings_column_wrapper expect_val({"y", "y", "xx", "zzz", "zzz", "", ""}, + {true, true, true, true, true, false, false}); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +template +struct GroupbyReplaceNullsListsTest : public BaseFixture { +}; + +TYPED_TEST_CASE(GroupbyReplaceNullsListsTest, FixedWidthTypes); + +TYPED_TEST(GroupbyReplaceNullsListsTest, PrecedingFillNonNested) +{ + using LCW = lists_column_wrapper; + // Group 0 value: {{1 2 3} @ {4 5} @} --> {{1 2 3} {1 2 3} {4 5} {4 5}}, w/o leading nulls + // Group 1 value: {@ {} @} --> {@ {} {}}, w/ leading nulls + fixed_width_column_wrapper key{0, 1, 0, 0, 1, 1, 0}; + + std::vector mask{1, 0, 0, 1, 1, 0, 0}; + LCW val({{1, 2, 3}, {}, {}, {4, 5}, {}, {}, {}}, mask.begin()); + + fixed_width_column_wrapper expect_key{0, 0, 0, 0, 1, 1, 1}; + std::vector expected_mask{1, 1, 1, 1, 0, 1, 1}; + LCW expect_val({{1, 2, 3}, {1, 2, 3}, {4, 5}, {4, 5}, {-1}, {}, {}}, expected_mask.begin()); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TYPED_TEST(GroupbyReplaceNullsListsTest, FollowingFillNonNested) +{ + using LCW = lists_column_wrapper; + // Group 0 value: {@ {5 6} @ {-1}} --> {{5 6} {5 6} {-1} {-1}}, w/o trailing nulls + // Group 1 value: {@ {} @} --> {{} {} @}}, w/ trailing nulls + fixed_width_column_wrapper key{0, 1, 0, 0, 1, 1, 0}; + + std::vector mask{0, 0, 1, 0, 1, 0, 1}; + LCW val({{}, {}, {5, 6}, {}, {}, {}, {-1}}, mask.begin()); + + fixed_width_column_wrapper expect_key{0, 0, 0, 0, 1, 1, 1}; + std::vector expected_mask{1, 1, 1, 1, 1, 1, 0}; + LCW 
expect_val({{5, 6}, {5, 6}, {-1}, {-1}, {}, {}, {}}, expected_mask.begin()); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +TYPED_TEST(GroupbyReplaceNullsListsTest, PrecedingFillNested) +{ + using LCW = lists_column_wrapper; + using Mask_t = std::vector; + // Group 0 value: {{{1 @ 3} @} + // @ + // {{@} {}}}} --> + // {{{1 @ 3} @} + // {{1 @ 3} @} + // {{@} {}}}}, w/o leading nulls + // Group 1 value: {@ + // {@ {102 @}} + // @ + // {{@ 202} {}}}} --> + // {@ + // {@ {102 @}} + // {@ {102 @}} + // {{@ 202} {}}}}, w/ leading nulls + // Only top level nulls are replaced. + fixed_width_column_wrapper key{1, 0, 1, 1, 0, 0, 1}; + + // clang-format off + LCW val({{}, + LCW({LCW({1, -1, 3}, Mask_t{1, 0, 1}.begin()), {}}, Mask_t{1, 0}.begin()), + LCW({LCW{}, LCW({102, -1}, Mask_t{1, 0}.begin())}, Mask_t{0, 1}.begin()), + {}, + {}, + {LCW({{}}, Mask_t{0}.begin()), LCW{}}, + {LCW({-1, 202}, Mask_t{0, 1}.begin()), LCW{}}}, + Mask_t{0, 1, 1, 0, 0, 1, 1}.begin()); + // clang-format on + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1, 1}; + + // clang-format off + LCW expect_val({LCW({LCW({1, -1, 3}, Mask_t{1, 0, 1}.begin()), {}}, Mask_t{1, 0}.begin()), + LCW({LCW({1, -1, 3}, Mask_t{1, 0, 1}.begin()), {}}, Mask_t{1, 0}.begin()), + {LCW({{}}, Mask_t{0}.begin()), LCW{}}, + {}, + LCW({LCW{}, LCW({102, -1}, Mask_t{1, 0}.begin())}, Mask_t{0, 1}.begin()), + LCW({LCW{}, LCW({102, -1}, Mask_t{1, 0}.begin())}, Mask_t{0, 1}.begin()), + {LCW({-1, 202}, Mask_t{0, 1}.begin()), LCW{}}}, + Mask_t{1, 1, 1, 0, 1 ,1 ,1}.begin()); + // clang-format on + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TYPED_TEST(GroupbyReplaceNullsListsTest, FollowingFillNested) +{ + using LCW = lists_column_wrapper; + using Mask_t = std::vector; + // Group 0 value: {{{1 @ 3} @} + // @ + // {{@} {}}}} --> + // {{{1 @ 3} @} + // {{@} {}}}} + // {{@} {}}}}, w/o trailing nulls + // Group 1 value: {{@ {102 @}} + // @ + // {{@ 202} {}}}} + // @ --> + // {{@ {102 @}} + // {{@ 202} {}}} + // {{@ 202} {}}} + // @}, w/ trailing nulls + // Only top level nulls are replaced. 
+ fixed_width_column_wrapper key{1, 0, 1, 1, 0, 0, 1}; + + // clang-format off + LCW val({LCW({LCW{}, LCW({102, -1}, Mask_t{1, 0}.begin())}, Mask_t{0, 1}.begin()), + LCW({LCW({1, -1, 3}, Mask_t{1, 0, 1}.begin()), {}}, Mask_t{1, 0}.begin()), + {}, + {LCW({-1, 202}, Mask_t{0, 1}.begin()), LCW{}}, + {}, + {LCW({{}}, Mask_t{0}.begin()), LCW{}}, + {}}, + Mask_t{1, 1, 0, 1, 0, 1, 0}.begin()); + // clang-format on + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1, 1}; + + // clang-format off + LCW expect_val({LCW({LCW({1, -1, 3}, Mask_t{1, 0, 1}.begin()), {}}, Mask_t{1, 0}.begin()), + {LCW({{}}, Mask_t{0}.begin()), LCW{}}, + {LCW({{}}, Mask_t{0}.begin()), LCW{}}, + LCW({LCW{}, LCW({102, -1}, Mask_t{1, 0}.begin())}, Mask_t{0, 1}.begin()), + {LCW({-1, 202}, Mask_t{0, 1}.begin()), LCW{}}, + {LCW({-1, 202}, Mask_t{0, 1}.begin()), LCW{}}, + {}}, + Mask_t{1, 1, 1, 1, 1, 1, 0}.begin()); + // clang-format on + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +struct GroupbyReplaceNullsStructsTest : public BaseFixture { + using SCW = structs_column_wrapper; + + SCW data(fixed_width_column_wrapper field0, + strings_column_wrapper field1, + lists_column_wrapper field2, + std::initializer_list mask) + { + return SCW({field0, field1, field2}, mask.begin()); + } +}; + +TEST_F(GroupbyReplaceNullsStructsTest, PrecedingFill) +{ + using LCW = lists_column_wrapper; + using Mask_t = std::vector; + fixed_width_column_wrapper key{1, 0, 0, 1, 0, 1, 1}; + + // Only null rows are replaced. + + SCW val = + this->data({{1, -1, 3, -1, -1, -1, 7}, {1, 0, 1, 0, 0, 0, 1}}, + {{"x", "yy", "", "", "", "zz", ""}, {true, true, false, false, false, true, false}}, + LCW({{1, 2, 3}, {-1}, {}, {}, {42}, {}, {}}, Mask_t{1, 1, 0, 0, 1, 0, 0}.begin()), + {1, 1, 0, 0, 1, 1, 0}); + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1, 1}; + + SCW expect_val = this->data( + {{-1, -1, -1, 1, 1, -1, -1}, {0, 0, 0, 1, 1, 0, 0}}, + {{"yy", "yy", "", "x", "x", "zz", "zz"}, {true, true, false, true, true, true, true}}, + LCW({LCW{-1}, {-1}, {42}, {1, 2, 3}, {1, 2, 3}, {}, {}}, Mask_t{1, 1, 1, 1, 1, 0, 0}.begin()), + {1, 1, 1, 1, 1, 1, 1}); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::PRECEDING); +} + +TEST_F(GroupbyReplaceNullsStructsTest, FollowingFill) +{ + using LCW = lists_column_wrapper; + using Mask_t = std::vector; + fixed_width_column_wrapper key{1, 0, 0, 1, 0, 1, 1}; + + // Only null rows are replaced. + + SCW val = + this->data({{1, -1, 3, -1, -1, -1, 7}, {1, 0, 1, 0, 0, 0, 1}}, + {{"x", "yy", "", "", "", "zz", ""}, {true, true, false, false, false, true, false}}, + LCW({{1, 2, 3}, {-1}, {}, {}, {42}, {}, {}}, Mask_t{1, 1, 0, 0, 1, 0, 0}.begin()), + {1, 1, 0, 0, 1, 1, 0}); + + fixed_width_column_wrapper expect_key{0, 0, 0, 1, 1, 1, 1}; + + SCW expect_val = this->data( + {{-1, -1, -1, 1, -1, -1, -1}, {0, 0, 0, 1, 0, 0, 0}}, + {{"yy", "", "", "x", "zz", "zz", ""}, {true, false, false, true, true, true, false}}, + LCW({LCW{-1}, {42}, {42}, {1, 2, 3}, {}, {}, {}}, Mask_t{1, 1, 1, 1, 0, 0, 0}.begin()), + {1, 1, 1, 1, 1, 1, 0}); + + TestReplaceNullsGroupbySingle(key, val, expect_key, expect_val, replace_policy::FOLLOWING); +} + +} // namespace test +} // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2225898d697..f3bad855725 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libcpp.vector cimport vector from libcpp.memory cimport unique_ptr @@ -11,6 +11,8 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.aggregation cimport aggregation from cudf._lib.cpp.types cimport size_type, order, null_order, null_policy +from cudf._lib.cpp.replace cimport replace_policy +from cudf._lib.cpp.utilities.host_span cimport host_span cdef extern from "cudf/groupby.hpp" \ @@ -74,3 +76,8 @@ cdef extern from "cudf/groupby.hpp" \ groups get_groups() except + groups get_groups(table_view values) except + + + pair[unique_ptr[table], unique_ptr[table]] replace_nulls( + const table_view& value, + const vector[replace_policy] replace_policy + ) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 3d1a6493028..1979ddf8f0c 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -24,7 +24,12 @@ from cudf._lib.column cimport Column from cudf._lib.table cimport Table from cudf._lib.aggregation cimport Aggregation, make_aggregation +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table, table_view +from cudf._lib.cpp.replace cimport replace_policy +from cudf._lib.cpp.utilities.host_span cimport host_span +from cudf._lib.cpp.types cimport size_type cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.groupby as libcudf_groupby @@ -202,6 +207,32 @@ cdef class GroupBy: return Table(data=result_data, index=grouped_keys) + def replace_nulls(self, Table values, object method): + cdef table_view val_view = values.view() + cdef pair[unique_ptr[table], unique_ptr[table]] c_result + cdef replace_policy policy = ( + replace_policy.PRECEDING + if method == 'ffill' else replace_policy.FOLLOWING + ) + cdef vector[replace_policy] policies = vector[replace_policy]( + val_view.num_columns(), policy + ) + + with nogil: + c_result = move( + self.c_obj.get()[0].replace_nulls(val_view, policies) + ) + + sorted_keys = Table.from_unique_ptr( + move(c_result.first), + column_names=self.keys._column_names + ) + grouped_result = Table.from_unique_ptr( + move(c_result.second), column_names=values._column_names + ) + + result = Table(data=grouped_result, index=sorted_keys) + return result _GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} From dd5eecd4d726915b2a28c42360419535a01b87de Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 24 May 2021 15:54:27 -0600 Subject: [PATCH 24/24] Fix struct binary search and struct flattening (#8268) This PR fixes several bugs. In particular: * Fixes a bug in struct binary search that only check for null elements at the top level. * Fixes a bug in struct flattening that uses default null order for the children of the input structs columns. Unit tests for struct binary search and struct sorting are also rewritten. Closes #8189. 
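For context, the first bug can be illustrated with a small sketch (names and setup below are illustrative assumptions, not part of the patch): a STRUCT column can be entirely valid at the top level while one of its children still contains nulls, so checking only the top level selects the wrong comparison path.

```cpp
// Illustrative sketch: top-level null check vs. nested null check on a STRUCT column.
// Uses the cudf test wrappers seen elsewhere in this series; names are made up.
#include <cudf_test/column_wrapper.hpp>

#include <cudf/table/table_view.hpp>

bool nested_null_demo()
{
  // The child has a null in row 1, but every struct row itself is valid.
  cudf::test::fixed_width_column_wrapper<int32_t> child{{1, 2, 3}, {1, 0, 1}};
  cudf::test::structs_column_wrapper parent({child});
  cudf::table_view t({parent});

  // has_nulls(t) inspects only the top level and returns false here, which is why the
  // null-aware code path was previously skipped; has_nested_nulls(t) also walks the
  // children and returns true.
  return cudf::has_nested_nulls(t) && !cudf::has_nulls(t);
}
```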
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jake Hemstad (https://github.com/jrhemstad) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8268 --- cpp/include/cudf/table/table_view.hpp | 14 +++++- cpp/src/search/search.cu | 2 +- cpp/src/structs/utilities.cpp | 6 +-- cpp/tests/search/search_struct_test.cpp | 20 ++++---- cpp/tests/sort/sort_test.cpp | 61 ++++++++++++++----------- 5 files changed, 59 insertions(+), 44 deletions(-) diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index a225e590f9a..1ff701c3b01 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -257,9 +257,19 @@ class mutable_table_view : public detail::table_view_base { mutable_table_view(std::vector const& views); }; -inline bool has_nulls(table_view view) +inline bool has_nulls(table_view const& view) { - return std::any_of(view.begin(), view.end(), [](column_view col) { return col.has_nulls(); }); + return std::any_of(view.begin(), view.end(), [](auto const& col) { return col.has_nulls(); }); +} + +inline bool has_nested_nulls(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.has_nulls() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nulls(table_view{{child_col}}); + }); + }); } /** diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index cc4f0727a77..fbb89bec731 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -101,7 +101,7 @@ std::unique_ptr search_ordered(table_view const& t, auto const matched = dictionary::detail::match_dictionaries({t, values}, stream); // Prepare to flatten the structs column - auto const has_null_elements = has_nulls(t) or has_nulls(values); + auto const has_null_elements = has_nested_nulls(t) or has_nested_nulls(values); auto const flatten_nullability = has_null_elements ? 
structs::detail::column_nullability::FORCE : structs::detail::column_nullability::MATCH_INCOMING; diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 6cc537d2042..4f7795bad7a 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -107,13 +107,11 @@ struct flattened_table { for (decltype(col.num_children()) i = 0; i < col.num_children(); ++i) { auto const& child = col.get_sliced_child(i); if (child.type().id() == type_id::STRUCT) { - flatten_struct_column(structs_column_view{child}, col_order, null_order::BEFORE); - // default spark behaviour is null_order::BEFORE + flatten_struct_column(structs_column_view{child}, col_order, col_null_order); } else { flat_columns.push_back(child); if (not column_order.empty()) flat_column_order.push_back(col_order); - if (not null_precedence.empty()) flat_null_precedence.push_back(null_order::BEFORE); - // default spark behaviour is null_order::BEFORE + if (not null_precedence.empty()) flat_null_precedence.push_back(col_null_order); } } } diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index fbd8b283a42..1c2e9b02f05 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -167,8 +167,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{1, 0, 10, 10, 2, 10}; - expected_upper_bound = int32s_col{2, 0, 10, 11, 6, 10}; + expected_lower_bound = int32s_col{1, 9, 9, 10, 2, 9}; + expected_upper_bound = int32s_col{2, 10, 9, 11, 6, 9}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); @@ -187,8 +187,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{7, 11, 0, 0, 3, 0}; - expected_upper_bound = int32s_col{8, 11, 0, 0, 7, 0}; + expected_lower_bound = int32s_col{7, 0, 0, 0, 3, 0}; + expected_upper_bound = int32s_col{8, 0, 0, 0, 7, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); } @@ -214,8 +214,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) // Sorted asc, search nulls last results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{3, 0, 11, 11, 4, 11}; - expected_upper_bound = int32s_col{4, 0, 11, 11, 8, 11}; + expected_lower_bound = int32s_col{3, 11, 11, 11, 4, 11}; + expected_upper_bound = int32s_col{4, 11, 11, 11, 8, 11}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); @@ -232,8 +232,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) // Sorted dsc, search nulls last results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{7, 11, 0, 0, 3, 0}; - expected_upper_bound = int32s_col{8, 11, 0, 0, 
7, 0}; + expected_lower_bound = int32s_col{7, 0, 0, 0, 3, 0}; + expected_upper_bound = int32s_col{8, 0, 0, 0, 7, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); } @@ -261,8 +261,8 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); - expected_lower_bound = int32s_col{1, 0, 10, 0, 2, 10}; - expected_upper_bound = int32s_col{2, 1, 10, 1, 6, 10}; + expected_lower_bound = int32s_col{1, 0, 9, 0, 2, 9}; + expected_upper_bound = int32s_col{2, 1, 9, 1, 6, 9}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), print_all); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), print_all); diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 9eb082c513c..0f4688119b7 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -50,11 +50,16 @@ void run_sort_test(table_view input, CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort_by_key_table->view(), got_sort_by_key_table->view()); } +using TestTypes = cudf::test::Concat; + template struct Sort : public BaseFixture { }; -TYPED_TEST_CASE(Sort, NumericTypes); +TYPED_TEST_CASE(Sort, TestTypes); TYPED_TEST(Sort, WithNullMax) { @@ -461,14 +466,14 @@ TYPED_TEST(Sort, WithStructColumnCombinations) +------------+ +------------+ +------------+ +------------+ | s| | s| | s| | s| +------------+ +------------+ +------------+ +------------+ - 2 | null| 1 | {1, null}| 2 | null| 3 |{null, null}| - 4 | null| 0 | {0, null}| 4 | null| 5 |{null, null}| - 1 | {1, null}| 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| - 0 | {0, null}| 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| - 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| 0 | {0, null}| - 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| 1 | {1, null}| - 3 |{null, null}| 2 | null| 0 | {0, null}| 2 | null| - 5 |{null, null}| 4 | null| 1 | {1, null}| 4 | null| + 2 | null| 1 | {1, null}| 2 | null| 0 | {0, null}| + 4 | null| 0 | {0, null}| 4 | null| 1 | {1, null}| + 3 |{null, null}| 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| + 5 |{null, null}| 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| + 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| 3 |{null, null}| + 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| 5 |{null, null}| + 1 | {1, null}| 2 | null| 0 | {0, null}| 2 | null| + 0 | {0, null}| 4 | null| 1 | {1, null}| 4 | null| +------------+ +------------+ +------------+ +------------+ */ // clang-format on @@ -477,7 +482,7 @@ TYPED_TEST(Sort, WithStructColumnCombinations) std::vector column_order1{order::DESCENDING}; // desc_nulls_first - fixed_width_column_wrapper expected1{{2, 4, 1, 0, 6, 7, 3, 5}}; + fixed_width_column_wrapper expected1{{2, 4, 3, 5, 6, 7, 1, 0}}; auto got = sorted_order(input, column_order1, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); // Run test for sort and sort_by_key @@ -499,7 +504,7 @@ TYPED_TEST(Sort, WithStructColumnCombinations) run_sort_test(input, expected3, column_order2, {null_order::BEFORE}); // asce_nulls_last - fixed_width_column_wrapper expected4{{3, 5, 7, 6, 0, 1, 2, 4}}; + fixed_width_column_wrapper expected4{{0, 1, 7, 6, 3, 5, 2, 4}}; got = sorted_order(input, column_order2, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, 
got->view()); // Run test for sort and sort_by_key @@ -534,14 +539,14 @@ TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) +------------+ +------------+ | s| | s| +------------+ +------------+ - 2 | {9, 9}| 3 |{null, null}| - 4 | {9, 9}| 5 |{null, null}| - 1 | {1, null}| 7 | {null, 0}| - 0 | {0, null}| 6 | {null, 1}| - 6 | {null, 1}| 0 | {0, null}| - 7 | {null, 0}| 1 | {1, null}| - 3 |{null, null}| 2 | {9, 9}| - 5 |{null, null}| 4 | {9, 9}| + 3 |{null, null}| 0 | {0, null}| + 5 |{null, null}| 1 | {1, null}| + 6 | {null, 1}| 2 | {9, 9}| + 7 | {null, 0}| 4 | {9, 9}| + 2 | {9, 9}| 7 | {null, 0}| + 4 | {9, 9}| 6 | {null, 1}| + 1 | {1, null}| 3 |{null, null}| + 0 | {0, null}| 5 |{null, null}| +------------+ +------------+ */ // clang-format on @@ -550,31 +555,33 @@ TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) std::vector column_order{order::DESCENDING}; // desc_nulls_first - fixed_width_column_wrapper expected1{{2, 4, 1, 0, 6, 7, 3, 5}}; + fixed_width_column_wrapper expected1{{3, 5, 6, 7, 2, 4, 1, 0}}; auto got = sorted_order(input, column_order, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); // Run test for sort and sort_by_key run_sort_test(input, expected1, column_order, {null_order::AFTER}); // desc_nulls_last + fixed_width_column_wrapper expected2{{2, 4, 1, 0, 6, 7, 3, 5}}; got = sorted_order(input, column_order, {null_order::BEFORE}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); // Run test for sort and sort_by_key - run_sort_test(input, expected1, column_order, {null_order::BEFORE}); + run_sort_test(input, expected2, column_order, {null_order::BEFORE}); // asce_nulls_first std::vector column_order2{order::ASCENDING}; - fixed_width_column_wrapper expected2{{3, 5, 7, 6, 0, 1, 2, 4}}; + fixed_width_column_wrapper expected3{{3, 5, 7, 6, 0, 1, 2, 4}}; got = sorted_order(input, column_order2, {null_order::BEFORE}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, got->view()); // Run test for sort and sort_by_key - run_sort_test(input, expected2, column_order2, {null_order::BEFORE}); + run_sort_test(input, expected3, column_order2, {null_order::BEFORE}); // asce_nulls_last + fixed_width_column_wrapper expected4{{0, 1, 2, 4, 7, 6, 3, 5}}; got = sorted_order(input, column_order2, {null_order::AFTER}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, got->view()); // Run test for sort and sort_by_key - run_sort_test(input, expected2, column_order2, {null_order::AFTER}); + run_sort_test(input, expected4, column_order2, {null_order::AFTER}); } TYPED_TEST(Sort, Stable)