From 0a00579350e37e9fb9c159f22c4def8c60c6f8f5 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Wed, 25 Aug 2021 13:15:27 -0500 Subject: [PATCH 1/8] Remove -g from cython compile commands (#9074) Removes `-g` from the compile commands generated by distutils to compile Cython files. This will make our container images, conda packages, and python wheels smaller. --- ci/gpu/build.sh | 4 +- conda/environments/cudf_dev_cuda11.0.yml | 4 +- conda/environments/cudf_dev_cuda11.2.yml | 4 +- python/cudf/setup.py | 81 ++++++++++++++++-------- 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8ebc85e5736..d825de2ad04 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -101,8 +101,8 @@ function install_dask { # Install the main version of dask, distributed, and streamz gpuci_logger "Install the main version of dask, distributed, and streamz" set -x - pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps - pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps # Need to uninstall streamz that is already in the env. pip uninstall -y streamz pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 62b59c3f081..d8635b09f8b 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -58,7 +58,7 @@ dependencies: - cachetools - transformers - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.07.1 + - git+https://github.com/dask/distributed.git@2021.07.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 94c7116802b..61af2f8aef1 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -58,7 +58,7 @@ dependencies: - cachetools - transformers - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.07.1 + - git+https://github.com/dask/distributed.git@2021.07.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 54921396b6f..e9fd3ae9d1f 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -6,13 +6,24 @@ import subprocess import sys import sysconfig + +# Must import in this order: +# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext +# Otherwise, setuptools.command.build_ext ends up inheriting from +# Cython.Distutils.old_build_ext which we do not want +import setuptools + +try: + from Cython.Distutils.build_ext import new_build_ext as _build_ext +except ImportError: + from setuptools.command.build_ext import build_ext as _build_ext + from distutils.spawn import find_executable from distutils.sysconfig import get_python_lib import numpy as np import pyarrow as pa -from Cython.Build import cythonize -from Cython.Distutils import build_ext +import setuptools.command.build_ext from 
setuptools import find_packages, setup from setuptools.extension import Extension @@ -105,22 +116,46 @@ def get_cuda_version_from_header(cuda_include_dir, delimeter=""): ), ) -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 -cmdclass = versioneer.get_cmdclass() +class build_ext_and_proto_no_debug(_build_ext): + def build_extensions(self): + def remove_flags(compiler, *flags): + for flag in flags: + try: + compiler.compiler_so = list( + filter((flag).__ne__, compiler.compiler_so) + ) + except Exception: + pass + # Full optimization + self.compiler.compiler_so.append("-O3") + # Silence '-Wunknown-pragmas' warning + self.compiler.compiler_so.append("-Wno-unknown-pragmas") + # No debug symbols, full optimization, no '-Wstrict-prototypes' warning + remove_flags( + self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes" + ) + super().build_extensions() -class build_ext_and_proto(build_ext): - def build_extensions(self): - try: - # Silence the '-Wstrict-prototypes' warning - self.compiler.compiler_so.remove("-Wstrict-prototypes") - except Exception: - pass - build_ext.build_extensions(self) + def finalize_options(self): + if self.distribution.ext_modules: + # Delay import this to allow for Cython-less installs + from Cython.Build.Dependencies import cythonize + + nthreads = getattr(self, "parallel", None) # -j option in Py3.5+ + nthreads = int(nthreads) if nthreads else None + self.distribution.ext_modules = cythonize( + self.distribution.ext_modules, + nthreads=nthreads, + force=self.force, + gdb_debug=False, + compiler_directives=dict( + profile=False, language_level=3, embedsignature=True + ), + ) + # Skip calling super() and jump straight to setuptools + setuptools.command.build_ext.build_ext.finalize_options(self) def run(self): # Get protoc @@ -158,11 +193,9 @@ def run(self): src.write(new_src_content) # Run original Cython build_ext command - build_ext.run(self) + _build_ext.run(self) -cmdclass["build_ext"] = build_ext_and_proto - extensions = [ Extension( "*", @@ -196,6 +229,10 @@ def run(self): ) ] +cmdclass = versioneer.get_cmdclass() +cmdclass["build_ext"] = build_ext_and_proto_no_debug + + setup( name="cudf", version=versioneer.get_version(), @@ -214,13 +251,7 @@ def run(self): ], # Include the separately-compiled shared library setup_requires=["cython", "protobuf"], - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), + ext_modules=extensions, packages=find_packages(include=["cudf", "cudf.*"]), package_data=dict.fromkeys( find_packages(include=["cudf._lib*"]), ["*.pxd"], From 9da7c01bf394243ae37319277e83a8edda3b4c70 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 22 Sep 2021 12:37:56 -0400 Subject: [PATCH 2/8] Fix call to thrust::reduce_by_key in argmin/argmax libcudf groupby (#9263) Closes #9156 This PR simplifies the parameters when calling thrust::reduce_by_key for the argmin/argmax aggregations in cudf::groupby. The illegalMemoryAccess found in #9156 was due to invalid data being passed from thrust::reduce_by_key through to the BinaryPredicate function as documented in NVIDIA/thrust#1525 The invalid data being passed is only a real issue for strings columns where the device pointer was neither nullptr nor a valid address. 
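In outline, the fix reduces over row indices and validates each index inside the comparator before dereferencing. A simplified sketch, condensed from the diff below (illustrative only; `d_col` is a `cudf::column_device_view` of the grouped values):

```cpp
// Bounds-checked ArgMin comparator over row indices (sketch of the operator in this PR).
template <typename T>
struct ArgMin {
  cudf::column_device_view const d_col;
  __device__ cudf::size_type operator()(cudf::size_type lhs, cudf::size_type rhs) const
  {
    // Out-of-bounds or null indices are simply ignored, so any garbage value
    // thrust::reduce_by_key feeds in cannot cause an invalid memory access.
    if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; }
    if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; }
    return d_col.element<T>(lhs) < d_col.element<T>(rhs) ? lhs : rhs;
  }
};
```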
The new logic provides only size_type values to thrust::reduce_by_key, so invalid values can only be out-of-bounds indices for the input column, which is easily checked before retrieving the string_view objects within the ArgMin and ArgMax operators. This is the same as #9244, but based on 21.10. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Devavret Makkar (https://github.com/devavret) - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9263 --- .../sort/group_single_pass_reduction_util.cuh | 96 ++++++------------- 1 file changed, 30 insertions(+), 66 deletions(-) diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 8eccadd653e..db2ae5b5d8e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -31,77 +31,50 @@ #include #include #include -#include -#include #include namespace cudf { namespace groupby { namespace detail { -// ArgMin binary operator with tuple of (value, index) +/** + * @brief ArgMin binary operator with index values into input column. + * + * @tparam T Type of the underlying column. Must support '<' operator. + */ template <typename T> struct ArgMin { - CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple<T, size_type> const& lhs, - thrust::tuple<T, size_type> const& rhs) const - { - if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL) - return rhs; - else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL) - return lhs; - else - return thrust::get<0>(lhs) < thrust::get<0>(rhs) ? lhs : rhs; - } -}; - -// ArgMax binary operator with tuple of (value, index) template <typename T> -struct ArgMax { - CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple<T, size_type> const& lhs, - thrust::tuple<T, size_type> const& rhs) const - { - if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL) - return rhs; - else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL) - return lhs; - else - return thrust::get<0>(lhs) > thrust::get<0>(rhs) ? lhs : rhs; - } -}; - -struct get_tuple_second_element { - template <typename T> - __device__ size_type operator()(thrust::tuple<T, size_type> const& rhs) const + column_device_view const d_col; + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const { - return thrust::get<1>(rhs); + // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // github.com/NVIDIA/thrust/issues/1525 + // where invalid random values may be passed here by thrust::reduce_by_key + if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } + if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } + return d_col.element<T>(lhs) < d_col.element<T>(rhs) ? lhs : rhs; } }; /** - * @brief Functor to store the boolean value to null mask. + * @brief ArgMax binary operator with index values into input column. + * + * @tparam T Type of the underlying column. Must support '<' operator.
*/ -struct bool_to_nullmask { - mutable_column_device_view d_result; - __device__ void operator()(size_type i, bool rhs) +template <typename T> +struct ArgMax { + column_device_view const d_col; + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const { - if (rhs) { - d_result.set_valid(i); - } else { - d_result.set_null(i); - } + // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // github.com/NVIDIA/thrust/issues/1525 + // where invalid random values may be passed here by thrust::reduce_by_key + if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } + if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } + return d_col.element<T>(rhs) < d_col.element<T>(lhs) ? lhs : rhs; } }; -/** - * @brief Returns index for non-null element, and SENTINEL for null element in a column. - * - */ -struct null_as_sentinel { - column_device_view const col; - size_type const SENTINEL; - __device__ size_type operator()(size_type i) const { return col.is_null(i) ? SENTINEL : i; } -}; - /** * @brief Value accessor for column which supports dictionary column too. * @@ -191,25 +164,16 @@ struct reduce_functor { auto resultview = mutable_column_device_view::create(result->mutable_view(), stream); auto valuesview = column_device_view::create(values, stream); if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) { - constexpr auto SENTINEL = - (K == aggregation::ARGMAX ? cudf::detail::ARGMAX_SENTINEL : cudf::detail::ARGMIN_SENTINEL); - auto idx_begin = - cudf::detail::make_counting_transform_iterator(0, null_as_sentinel{*valuesview, SENTINEL}); - // dictionary keys are sorted, so dictionary32 index comparison is enough. - auto column_begin = valuesview->begin<T>(); - auto begin = thrust::make_zip_iterator(thrust::make_tuple(column_begin, idx_begin)); - auto result_begin = thrust::make_transform_output_iterator(resultview->begin<size_type>(), - get_tuple_second_element{}); using OpType = std::conditional_t<(K == aggregation::ARGMAX), ArgMax<T>, ArgMin<T>>; thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.data(), group_labels.data() + group_labels.size(), - begin, + thrust::make_counting_iterator(0), thrust::make_discard_iterator(), - result_begin, - thrust::equal_to<size_type>{}, - OpType{}); + resultview->begin<size_type>(), + thrust::equal_to<size_type>{}, + OpType{*valuesview}); } else { auto init = OpType::template identity(); auto begin = cudf::detail::make_counting_transform_iterator( From 20713dff41fd6668e6e631c148e86424597b4934 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 22 Sep 2021 09:55:34 -0700 Subject: [PATCH 3/8] Explicit about bitwidth difference between cudf boolean and arrow boolean (#9192) Currently, we map boolean type to `pa.int8` because the bitwidth of cudf boolean mismatches that in arrow. However, the implications of this mapping are subtle and may cause unwanted results such as: ```python >>> cudf.StructDtype({ "a": np.bool_, "b": np.int8, }) StructDtype({'a': dtype('int8'), 'b': dtype('int8')}) ``` This PR changes the mapping back to `pa.bool_`, and uses explicit type handling when we are dealing with type conversion to arrow. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - https://github.com/brandon-b-miller - H.
Thomson Comer (https://github.com/thomcom) URL: https://github.com/rapidsai/cudf/pull/9192 --- python/cudf/cudf/_lib/utils.pyx | 20 ++++++++++++++++++-- python/cudf/cudf/core/column/column.py | 5 +---- python/cudf/cudf/tests/test_dtypes.py | 9 +++++++++ python/cudf/cudf/utils/dtypes.py | 7 ++++++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index dd12c92a15a..810cdd51df5 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import cudf @@ -81,7 +82,14 @@ cpdef generate_pandas_metadata(Table table, index): ): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(col.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interoperable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. + if col.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(col.dtype)) # Indexes if index is not False: @@ -125,7 +133,15 @@ cpdef generate_pandas_metadata(Table table, index): elif is_list_dtype(idx): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(idx.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interoperable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. + if idx.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(idx.dtype)) + index_levels.append(idx) col_names.append(name) index_descriptors.append(descr) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8f18d83eb31..de278db919d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2089,10 +2089,7 @@ def as_column( data ) np_type = np.dtype(dtype).type - if np_type == np.bool_: - pa_type = pa.bool_() - else: - pa_type = np_to_pa_dtype(np.dtype(dtype)) + pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( pa.array( arbitrary, diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index d98ab0504cc..877cec24afa 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -324,3 +324,12 @@ def test_dtype(in_dtype, expect): def test_dtype_raise(in_dtype): with pytest.raises(TypeError): cudf.dtype(in_dtype) + + +def test_dtype_np_bool_to_pa_bool(): + """This test case captures that utility np_to_pa_dtype + should map np.bool_ to pa.bool_; nuances of the bit width + difference should be handled elsewhere. + """ + + assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_() diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 5100f1a9c49..bdaf5e144a5 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,6 +14,11 @@ from cudf.core._compat import PANDAS_GE_120 _NA_REP = "<NA>" + +"""Map numpy dtype to pyarrow types. +Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special +handling is required when converting a Boolean column into arrow.
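+For example (an illustrative doctest; the exact pyarrow repr may vary by version):
+
+    >>> np_to_pa_dtype(np.dtype(np.bool_))
+    DataType(bool)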
+""" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), @@ -22,7 +27,7 @@ np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), - np.bool_: pa.int8(), + np.bool_: pa.bool_(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), From b0c8bbbab0c819881fab1323fb68f70971a224d7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 22 Sep 2021 12:21:31 -0500 Subject: [PATCH 4/8] Fix Java column leak in testParquetWriteMap (#9271) Fixes a Java column vector leak in TableTest#testParquetWriteMap. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9271 --- java/src/test/java/ai/rapids/cudf/TableTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index cd1e433d07b..b69dce57180 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6669,8 +6669,9 @@ void testParquetWriteMap() throws IOException { HostColumnVector.StructType structType = new HostColumnVector.StructType(true, Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), new HostColumnVector.BasicType(true, DType.STRING))); - try (Table t0 = new Table(ColumnVector.fromLists(new HostColumnVector.ListType(true, - structType), list1, list2, list3))) { + try (ColumnVector listColumn = ColumnVector.fromLists(new HostColumnVector.ListType(true, + structType), list1, list2, list3); + Table t0 = new Table(listColumn)) { try (TableWriter writer = Table.writeParquetChunked(options, f)) { writer.write(t0); } From ef5ba4cee31a5f335314b5ceec9d0db473aef7a0 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 22 Sep 2021 14:31:11 -0400 Subject: [PATCH 5/8] Fixing empty input to getMapValue crashing (#9262) This changes the calls in java/cudf to check for an empty input and return an empty result instead of crashing. Fixes #9253 Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9262 --- java/src/main/native/src/map_lookup.cu | 6 +++++- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index ad791747713..683651799e7 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -183,6 +183,10 @@ std::unique_ptr map_lookup(column_view const &map_column, string_scalar // Defensive checks. map_input_check(map_column, stream); + if (map_column.size() == 0) { + return make_empty_column(cudf::data_type{cudf::type_id::STRING}); + } + lists_column_view lcv{map_column}; column_view structs_column = lcv.get_sliced_child(stream); // Two-pass plan: construct gather map, and then gather() on structs_column.child(1). Plan A. 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0643776a546..d1af0d9a2f6 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5412,6 +5412,17 @@ void testGetMapValue() { } } + @Test + void testGetMapValueEmptyInput() { + HostColumnVector.StructType structType = new HostColumnVector.StructType(true, Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING))); + try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, structType)); + ColumnVector res = cv.getMapValue(Scalar.fromString("a")); + ColumnVector expected = ColumnVector.fromStrings()) { + assertColumnsAreEqual(expected, res); + } + } + @Test void testGetMapKeyExistence() { List list1 = Arrays.asList(new HostColumnVector.StructData("a", "b")); From 08cbbcdcea2c9fb18e5614f5e29ba99b5443d38f Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 23 Sep 2021 00:28:03 +0530 Subject: [PATCH 6/8] Use nvcomp's snappy compressor in ORC writer (#9242) Issue #9205 depends on #9235 Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Elias Stehle (https://github.com/elstehle) - https://github.com/nvdbaranec - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9242 --- cpp/src/io/orc/orc_common.h | 3 +- cpp/src/io/orc/orc_gpu.h | 6 ++- cpp/src/io/orc/stripe_enc.cu | 96 ++++++++++++++++++++++++++++++----- cpp/src/io/orc/stripe_init.cu | 16 +++--- cpp/src/io/orc/writer_impl.cu | 51 ++++++++++--------- 5 files changed, 126 insertions(+), 46 deletions(-) diff --git a/cpp/src/io/orc/orc_common.h b/cpp/src/io/orc/orc_common.h index ab6788d01f1..eedaa9d4fc2 100644 --- a/cpp/src/io/orc/orc_common.h +++ b/cpp/src/io/orc/orc_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,7 @@ namespace orc { // ORC rows are divided into groups and assigned indexes for faster seeking static constexpr uint32_t default_row_index_stride = 10000; +static constexpr uint32_t BLOCK_HEADER_SIZE = 3; enum CompressionKind : uint8_t { NONE = 0, diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 30687331c15..88d7e26b3b6 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -355,6 +355,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] num_compressed_blocks Total number of compressed blocks * @param[in] compression Type of compression * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] * @param[out] comp_in Per-block compression input parameters @@ -365,10 +366,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, + uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index d50d3898c3b..9348d817dad 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -24,6 +24,9 @@ #include #include +#include + +#include namespace cudf { namespace io { @@ -1102,15 +1105,17 @@ __global__ void __launch_bounds__(1024) * @param[out] comp_out Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {256,1,1} __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? 
- gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, uint8_t* compressed_bfr, - uint32_t comp_blk_size) + uint32_t comp_blk_size, + uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; __shared__ uint8_t* volatile uncomp_base_g; @@ -1135,8 +1140,8 @@ __global__ void __launch_bounds__(256) uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); blk_in->srcDevice = src + b * comp_blk_size; blk_in->srcSize = blk_size; - blk_in->dstDevice = dst + b * (3 + comp_blk_size) + 3; // reserve 3 bytes for block header - blk_in->dstSize = blk_size; + blk_in->dstDevice = dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE; + blk_in->dstSize = max_comp_blk_size; blk_out->bytes_written = blk_size; blk_out->status = 1; blk_out->reserved = 0; @@ -1153,14 +1158,16 @@ __global__ void __launch_bounds__(256) * @param[in] comp_out Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, uint8_t* compressed_bfr, - uint32_t comp_blk_size) + uint32_t comp_blk_size, + uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; __shared__ const uint8_t* volatile comp_src_g; @@ -1271,20 +1278,83 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, + uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); gpuInitCompressionBlocks<<>>( - strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size); - if (compression == SNAPPY) { gpu_snap(comp_in, comp_out, num_compressed_blocks, stream); } + strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + if (compression == SNAPPY) { + auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); + bool use_nvcomp = env_use_nvcomp != nullptr ? 
std::atoi(env_use_nvcomp) : 0; + if (use_nvcomp) { + try { + size_t temp_size; + nvcompStatus_t nvcomp_status = nvcompBatchedSnappyCompressGetTempSize( + num_compressed_blocks, comp_blk_size, nvcompBatchedSnappyDefaultOpts, &temp_size); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, + "Error in getting snappy compression scratch size"); + + rmm::device_buffer scratch(temp_size, stream); + rmm::device_uvector uncompressed_data_ptrs(num_compressed_blocks, stream); + rmm::device_uvector uncompressed_data_sizes(num_compressed_blocks, stream); + rmm::device_uvector compressed_data_ptrs(num_compressed_blocks, stream); + rmm::device_uvector compressed_bytes_written(num_compressed_blocks, stream); + + auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), + uncompressed_data_sizes.begin(), + compressed_data_ptrs.begin()); + thrust::transform(rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(gpu_inflate_input_s in) { + return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); + }); + nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), + uncompressed_data_sizes.data(), + max_comp_blk_size, + num_compressed_blocks, + scratch.data(), + scratch.size(), + compressed_data_ptrs.data(), + compressed_bytes_written.data(), + nvcompBatchedSnappyDefaultOpts, + stream.value()); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in snappy compression"); + + thrust::transform(rmm::exec_policy(stream), + compressed_bytes_written.begin(), + compressed_bytes_written.end(), + comp_out.begin(), + [] __device__(size_t size) { + gpu_inflate_status_s status{}; + status.bytes_written = size; + return status; + }); + } catch (...) { + // If we reach this then there was an error in compressing so set an error status for each + // block + thrust::for_each(rmm::exec_policy(stream), + comp_out.begin(), + comp_out.end(), + [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + }; + + } else { + gpu_snap(comp_in.data(), comp_out.data(), num_compressed_blocks, stream); + } + } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, compressed_data, comp_blk_size); + strm_desc, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 94d8de6561b..d6dbdbe6403 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,13 +52,13 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t max_uncompressed_block_size = 0; uint32_t num_compressed_blocks = 0; uint32_t num_uncompressed_blocks = 0; - while (cur + 3 < end) { + while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? 
cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; gpu_inflate_input_s* init_ctl = nullptr; block_len >>= 1; - cur += 3; + cur += BLOCK_HEADER_SIZE; if (block_len > block_size || cur + block_len > end) { // Fatal num_compressed_blocks = 0; @@ -145,12 +145,12 @@ extern "C" __global__ void __launch_bounds__(128, 8) uint32_t num_compressed_blocks = 0; uint32_t max_compressed_blocks = s->info.num_compressed_blocks; - while (cur + 3 < end) { + while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size_est, uncompressed_size_actual; block_len >>= 1; - cur += 3; + cur += BLOCK_HEADER_SIZE; if (cur + block_len > end) { break; } if (is_uncompressed) { uncompressed_size_est = block_len; @@ -367,9 +367,11 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, for (;;) { uint32_t block_len, is_uncompressed; - if (cur + 3 > end || cur + 3 >= start + compressed_offset) { break; } + if (cur + BLOCK_HEADER_SIZE > end || cur + BLOCK_HEADER_SIZE >= start + compressed_offset) { + break; + } block_len = cur[0] | (cur[1] << 8) | (cur[2] << 16); - cur += 3; + cur += BLOCK_HEADER_SIZE; is_uncompressed = block_len & 1; block_len >>= 1; cur += block_len; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e0018ed7166..8a0112deb76 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -36,6 +36,8 @@ #include #include +#include + #include #include #include @@ -999,10 +1001,10 @@ void writer::impl::write_index_stream(int32_t stripe_id, record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && (static_cast(record.pos) >= compression_blocksize_) && - (record.comp_pos + 3 + comp_out[record.blk_pos].bytes_written < + (record.comp_pos + BLOCK_HEADER_SIZE + comp_out[record.blk_pos].bytes_written < static_cast(record.comp_size))) { record.pos -= compression_blocksize_; - record.comp_pos += 3 + comp_out[record.blk_pos].bytes_written; + record.comp_pos += BLOCK_HEADER_SIZE + comp_out[record.blk_pos].bytes_written; record.blk_pos += 1; } } @@ -1472,29 +1474,31 @@ void writer::impl::write(table_view const& table) } // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - auto stream_output = [&]() { + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { size_t max_stream_size = 0; bool all_device_write = true; - for (size_t stripe_id = 0; stripe_id < segmentation.num_stripes(); stripe_id++) { - for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least) - gpu::StripeStream* ss = &strm_descs[stripe_id][i]; - if (!out_sink_->is_device_write_preferred(ss->stream_size)) { all_device_write = false; } - size_t stream_size = ss->stream_size; - if (compression_kind_ != NONE) { - ss->first_block = num_compressed_blocks; - ss->bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * 3; - num_compressed_blocks += num_blocks; - compressed_bfr_size += stream_size; - } - 
max_stream_size = std::max(max_stream_size, stream_size); + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; } + max_stream_size = std::max(max_stream_size, stream_size); } if (all_device_write) { @@ -1519,10 +1523,11 @@ void writer::impl::write(table_view const& table) num_compressed_blocks, compression_kind_, compression_blocksize_, + max_compressed_block_size, strm_descs, enc_data.streams, - comp_in.device_ptr(), - comp_out.device_ptr(), + comp_in, + comp_out, stream); strm_descs.device_to_host(stream); comp_out.device_to_host(stream, true); From 10fd071dc12e35f02192d7bdd14af03221bb2ae9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 22 Sep 2021 12:05:14 -0700 Subject: [PATCH 7/8] Add `dseries.struct.explode` (#9086) Closes #8660 Per discussions in thread #8872 , this PR adds a struct-accessor member function to provide a lateral view to a struct type series. Example: ```python >>> import cudf, dask_cudf as dgd >>> ds = dgd.from_cudf(cudf.Series( ... [{'a': 42, 'b': 'str1', 'c': [-1]}, ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, ... {'a': 7, 'b': '', 'c': []}]), npartitions=2) >>> ds.struct.explode().compute() a b c 0 42 str1 [-1] 1 0 str2 [400, 500] 2 7 [] ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/9086 --- python/dask_cudf/dask_cudf/accessors.py | 26 +++++++++++++++++++ .../dask_cudf/tests/test_accessor.py | 15 +++++++++++ 2 files changed, 41 insertions(+) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 77973ee34ff..1c21fca51c8 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -37,6 +37,32 @@ def field(self, key): meta=self.d_series._meta._constructor([], dtype=typ), ) + def explode(self): + """ + Creates a dataframe view of the struct column, one column per field. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf, dask_cudf + >>> ds = dask_cudf.from_cudf(cudf.Series( + ... [{'a': 42, 'b': 'str1', 'c': [-1]}, + ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, + ... 
{'a': 7, 'b': '', 'c': []}]), npartitions=2) >>> ds.struct.explode().compute() a b c 0 42 str1 [-1] 1 0 str2 [400, 500] 2 7 [] """ return self.d_series.map_partitions( lambda s: s.struct.explode(), meta=self.d_series._meta.struct.explode(), ) + class ListMethods: def __init__(self, d_series): diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 805927dd474..1521ce41806 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -499,3 +499,18 @@ def test_dask_struct_field_Int_Error(data): with pytest.raises(IndexError): got.struct.field(1000).compute() + + +@pytest.mark.parametrize( "data", [ [{}, {}, {}], [{"a": 100, "b": "abc"}, {"a": 42, "b": "def"}, {"a": -87, "b": ""}], [{"a": [1, 2, 3], "b": {"c": 101}}, {"a": [4, 5], "b": {"c": 102}}], ], +) +def test_struct_explode(data): + expect = Series(data).struct.explode() + got = dgd.from_cudf(Series(data), 2).struct.explode() + + assert_eq(expect, got.compute()) From 1cb527f01aac631f4d44866b5474e503501d58cd Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 23 Sep 2021 01:26:59 +0530 Subject: [PATCH 8/8] Add shallow hash function and shallow equality comparison for column_view (#9185) Fixes #9140. Added `shallow_hash(column_view)` and unit tests. It computes hash values based on the shallow states of `column_view`: type, size, data pointer, null_mask pointer, offset, and the hash value of the children. `null_count` is not used since it is a cached value and it may vary based on contents of `null_mask`, and may be pre-computed or not. Fixes #9139. Added `is_shallow_equivalent(column_view, column_view)` (renamed from `shallow_equal`) and unit tests. It compares two column_views based on the shallow states of column_view: type, size, data pointer, null_mask pointer, offset, and the column_views of the children. null_count is not used since it is a cached value and it may vary based on contents of null_mask, and may be pre-computed or not. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) - Jake Hemstad (https://github.com/jrhemstad) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9185 --- cpp/include/cudf/column/column_view.hpp | 41 ++ cpp/include/cudf/detail/hashing.hpp | 36 ++ .../cudf/detail/utilities/hash_functions.cuh | 12 + cpp/include/cudf_test/type_lists.hpp | 12 + cpp/src/column/column_view.cpp | 55 +++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/column_view_shallow_test.cpp | 442 ++++++++++++++++++ 7 files changed, 599 insertions(+) create mode 100644 cpp/tests/column/column_view_shallow_test.cpp diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 7feaeafbad0..cd490c3c832 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -633,4 +633,45 @@ column_view bit_cast(column_view const& input, data_type type); */ mutable_column_view bit_cast(mutable_column_view const& input, data_type type); +namespace detail { +/** + * @brief Computes a hash value from the shallow state of the specified column + * + * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == + * shallow_hash(c1)`.
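+ *
+ * Example (illustrative; `col` is any owning cudf::column):
+ * @code
+ * auto v1 = cudf::column_view{col};
+ * auto v2 = cudf::column_view{col};  // a second view of the same physical column
+ * assert(cudf::detail::shallow_hash(v1) == cudf::detail::shallow_hash(v2));
+ * @endcode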
+ * + * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., + * it is independent of the number of elements in the column. + * + * This function does _not_ inspect the elements of `input` nor access any device memory or launch + * any kernels. + * + * @param input The `column_view` to compute hash + * @return The hash value derived from the shallow state of `input`. + */ +std::size_t shallow_hash(column_view const& input); + +/** + * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns + * + * Two columns are equivalent if for any operation `F` then: + * ``` + * is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent + * ``` + * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact + * same physical column. In other words, two physically independent columns may have exactly + * equivalent elements but their shallow state would not be equivalent. + * + * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, + * i.e., it is independent of the number of elements in either column. + * + * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor + * launch any kernels. + * + * @param lhs The left `column_view` to compare + * @param rhs The right `column_view` to compare + * @return If `lhs` and `rhs` have equivalent shallow state + */ +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); +} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 83d6be14709..bd5c8a42a51 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -19,6 +19,9 @@ #include +#include +#include + namespace cudf { namespace detail { @@ -53,5 +56,38 @@ std::unique_ptr serial_murmur_hash3_32( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +/** + * @brief Combines two hashed values into a single hashed value. + * + * Adapted from Boost hash_combine function, modified for 64-bit + * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + * + * @param lhs The first hashed value + * @param rhs The second hashed value + * @return Combined hash value + */ +constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) +{ + lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); + return lhs; +} } // namespace detail } // namespace cudf + +// specialization of std::hash for cudf::data_type +namespace std { +template <> +struct hash { + std::size_t operator()(cudf::data_type const& type) const noexcept + { + return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), + std::hash{}(type.scale())); + } +}; +} // namespace std diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 6eab13ae9af..65deadd6cd0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -395,6 +395,12 @@ struct MurmurHash3_32 { return h; } + /* Copyright 2005-2014 Daniel James. 
+ * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. @@ -795,6 +801,12 @@ struct IdentityHash { IdentityHash() = default; constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 74688b7f133..982c94ac402 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -315,6 +315,18 @@ using FixedWidthTypesWithoutChrono = Concat; */ using ComparableTypes = Concat; +/** + * @brief Provides a list of all compound types for use in GTest typed tests. + * + * Example: + * ``` + * // Invokes all typed fixture tests for all compound types in libcudf + * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes); + * ``` + */ +using CompoundTypes = + cudf::test::Types; + /** * @brief Provides a list of all types supported in libcudf for use in a GTest * typed test. diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 186669ae697..5749cb48c0e 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -22,6 +23,7 @@ #include +#include #include #include #include @@ -76,6 +78,59 @@ size_type column_view_base::null_count(size_type begin, size_type end) const ? 0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } + +// Struct to use custom hash combine and fold expression +struct HashValue { + std::size_t hash; + explicit HashValue(std::size_t h) : hash{h} {} + HashValue operator^(HashValue const& other) const + { + return HashValue{hash_combine(hash, other.hash)}; + } +}; + +template +constexpr auto hash(Ts&&... ts) +{ + return (... ^ HashValue(std::hash{}(ts))).hash; +} + +std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) +{ + std::size_t const init = (is_parent_empty or c.is_empty()) + ? 
hash(c.type(), 0) + : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); + return std::accumulate(c.child_begin(), + c.child_end(), + init, + [&c, is_parent_empty](std::size_t hash, auto const& child) { + return hash_combine( + hash, shallow_hash_impl(child, c.is_empty() or is_parent_empty)); + }); +} + +std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } + +bool shallow_equivalent_impl(column_view const& lhs, + column_view const& rhs, + bool is_parent_empty = false) +{ + bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; + return (lhs.type() == rhs.type()) and + (is_empty or ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and + (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and + std::equal(lhs.child_begin(), + lhs.child_end(), + rhs.child_begin(), + rhs.child_end(), + [is_empty](auto const& lhs_child, auto const& rhs_child) { + return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); + }); +} +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) +{ + return shallow_equivalent_impl(lhs, rhs); +} } // namespace detail // Immutable view constructor diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 03f7967cee0..cde170fb598 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -33,6 +33,7 @@ endfunction() # - column tests ---------------------------------------------------------------------------------- ConfigureTest(COLUMN_TEST column/bit_cast_test.cpp + column/column_view_shallow_test.cpp column/column_test.cu column/column_device_view_test.cu column/compound_test.cu) diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp new file mode 100644 index 00000000000..f76f682bb2f --- /dev/null +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +// fixed_width, dict, string, list, struct +template ()>* = nullptr> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template ()>* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::dictionary_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) + .release(); +} + +template or + std::is_same_v>* = nullptr> +std::unique_ptr example_column() + +{ + return cudf::test::strings_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) + .release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); + auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); + return cudf::test::structs_column_wrapper({member_0, member_1}).release(); +} + +template +struct ColumnViewShallowTests : public cudf::test::BaseFixture { +}; + +using AllTypes = cudf::test::Concat; +TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); + +// Test for fixed_width, dict, string, list, struct +// column_view, column_view = same hash. +// column_view, make a copy = same hash. +// new column_view from column = same hash +// column_view, copy column = diff hash +// column_view, diff column = diff hash. +// +// column_view old, update data + new column_view = same hash. +// column_view old, add null_mask + new column_view = diff hash. +// column_view old, update nulls + new column_view = same hash. +// column_view old, set_null_count + new column_view = same hash. +// +// column_view, sliced[0, size) = same hash (for split too) +// column_view, sliced[n:) = diff hash (for split too) +// column_view, bit_cast = diff hash +// +// mutable_column_view, column_view = same hash +// mutable_column_view, modified mutable_column_view = same hash +// +// update the children column data = same hash +// update the children column_views = diff hash + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_basic) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + + // column_view, diff column = diff hash.
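+  // (a copied column owns its own buffers, so two element-wise equal columns
+  // still have different shallow states and therefore different hashes)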
+ { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); + } +} +TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // add null_mask + new column_view = diff hash. + { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + col_view_new.null_count(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new2)); + } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_sliced[2])); + EXPECT_EQ(shallow_hash(col_sliced[0]), 
shallow_hash(col_new_sliced[0])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_new_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[2]), shallow_hash(col_new_sliced[2])); + } + + // column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_bitcast)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_child_updated)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_basic) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_copy)); + } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_copy)); + } + + // column_view, diff column = diff hash. + { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_diff)); + } +} +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // update data + new column_view = same hash. 
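+  // (rewriting the bytes behind the data pointer leaves the pointer itself,
+  // and hence the shallow state, unchanged)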
+ { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + // add null_mask + new column_view = diff hash. + { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); + col_view_new.null_count(); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new2)); + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[1])); + } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_sliced[2])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_new_sliced[0])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_new_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[2], col_new_sliced[2])); + } + + // column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto 
col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_bitcast)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + EXPECT_TRUE(is_shallow_equivalent(col_view, col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_child_updated)); + } + } +}