From 77fa49eddf1c961277ec5e0fb3616433f2a46ea4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 14:13:06 -0700 Subject: [PATCH 01/26] Clean up C++ includes to use <> instead of "". (#10658) This PR cleans up some C++ includes to use `#include <...>` instead of `#include "..."` where appropriate. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10658 --- cpp/benchmarks/io/orc/orc_writer.cpp | 2 +- cpp/benchmarks/sort/rank.cpp | 2 +- cpp/benchmarks/string/convert_durations.cpp | 15 +++++++-------- cpp/include/cudf/detail/reduction_functions.hpp | 2 +- cpp/libcudf_kafka/src/kafka_callback.cpp | 2 +- cpp/libcudf_kafka/src/kafka_consumer.cpp | 2 +- cpp/src/merge/merge.cu | 2 +- cpp/src/structs/structs_column_view.cpp | 4 ++-- .../binaryop/binop-compiled-fixed_point-test.cpp | 2 +- cpp/tests/hash_map/map_test.cu | 2 +- cpp/tests/iterator/value_iterator_test_strings.cu | 10 ++++++---- cpp/tests/partitioning/partition_test.cpp | 10 +++++----- 12 files changed, 28 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 525c13af5c0..f61dac7677b 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "cudf/io/types.hpp" #include #include @@ -23,6 +22,7 @@ #include #include +#include // to enable, run cmake with -DBUILD_BENCHMARKS=ON diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index 22acb241f0b..c3c77ebd52f 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "cudf/column/column_view.hpp" +#include #include #include diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index dc9a1e991b2..8af111d9a63 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -13,25 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include - +#include #include #include +#include #include #include #include #include +#include + +#include +#include + #include #include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" -#include "cudf/column/column_view.hpp" -#include "cudf/wrappers/durations.hpp" - class DurationsToString : public cudf::benchmark { }; template diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 3a6113e66ce..317e4d0cf47 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -17,9 +17,9 @@ #pragma once #include +#include #include -#include "cudf/lists/lists_column_view.hpp" #include namespace cudf { diff --git a/cpp/libcudf_kafka/src/kafka_callback.cpp b/cpp/libcudf_kafka/src/kafka_callback.cpp index 6b98747c145..79a40640627 100644 --- a/cpp/libcudf_kafka/src/kafka_callback.cpp +++ b/cpp/libcudf_kafka/src/kafka_callback.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "cudf_kafka/kafka_callback.hpp" +#include #include diff --git a/cpp/libcudf_kafka/src/kafka_consumer.cpp b/cpp/libcudf_kafka/src/kafka_consumer.cpp index 49e89a56e60..2ddaa9892da 100644 --- a/cpp/libcudf_kafka/src/kafka_consumer.cpp +++ b/cpp/libcudf_kafka/src/kafka_consumer.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "cudf_kafka/kafka_consumer.hpp" +#include #include diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 01a94457b69..9c94a6220d6 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,6 @@ #include #include -#include "cudf/utilities/traits.hpp" #include #include diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index db9496f18be..681f13386ff 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "cudf/utilities/error.hpp" #include #include +#include namespace cudf { diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index 64462669f90..28df893aff1 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -20,13 +20,13 @@ #include #include #include +#include #include #include #include #include -#include "cudf/utilities/error.hpp" #include #include diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index d69aee57756..f42549514e6 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -23,12 +23,12 @@ #include #include +#include #include #include #include -#include "rmm/exec_policy.hpp" #include #include #include diff --git a/cpp/tests/iterator/value_iterator_test_strings.cu b/cpp/tests/iterator/value_iterator_test_strings.cu index 5bddbfbd4aa..9aa18eb844f 100644 --- a/cpp/tests/iterator/value_iterator_test_strings.cu +++ b/cpp/tests/iterator/value_iterator_test_strings.cu @@ -12,10 +12,12 @@ * or implied. See the License for the specific language governing permissions and limitations under * the License. */ -#include "cudf/detail/utilities/vector_factories.hpp" -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" -#include +#include "iterator_tests.cuh" + +#include + +#include +#include #include #include diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index 785af409c4c..014a19e93a9 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,16 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include -#include #include #include #include #include #include -#include "cudf/sorting.hpp" +#include +#include +#include +#include template class PartitionTest : public cudf::test::BaseFixture { From 14a32619a5b1c0eff49588b141f8ef2eb754cadf Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 14:40:20 -0700 Subject: [PATCH 02/26] Improve User Guide docs (#10663) This PR makes some minor improvements to the cuDF user guide and some docstrings. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10663 --- docs/cudf/source/basics/basics.rst | 58 ++++++++++--------- docs/cudf/source/basics/internals.rst | 4 +- .../cudf/source/basics/io-gds-integration.rst | 24 ++++---- .../source/basics/io-nvcomp-integration.rst | 4 +- python/cudf/cudf/core/cut.py | 46 ++++++++++----- python/cudf/cudf/core/groupby/groupby.py | 21 +++---- python/cudf/cudf/core/single_column_frame.py | 4 +- 7 files changed, 91 insertions(+), 70 deletions(-) diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index 60a65558033..9b8983fba49 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -15,36 +15,40 @@ The following table lists all of cudf types. For methods requiring dtype argumen .. rst-class:: special-table .. table:: - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Kind of Data | Data Type | Scalar | String Aliases | - +========================+==================+=====================================================================================+=============================================+ - | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | - | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | - | | | | ``'uint32'``, ``'uint64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Strings | | `str `_ | ``'string'``, ``'object'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | - | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| - | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Categorical | CategoricalDtype | (none) | ``'category'`` | - 
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Boolean | | np.bool_ | ``'bool'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Decimal | Decimal32Dtype, | (none) | (none) | - | | Decimal64Dtype, | | | - | | Decimal128Dtype | | | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +=================+==================+==============================================================+==============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Decimal | Decimal32Dtype, | (none) | (none) | + | | Decimal64Dtype, | | | + | | Decimal128Dtype | | | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Lists | ListDtype | list | ``'list'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Structs | StructDtype | dict | ``'struct'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ **Note: All dtypes above are Nullable** -.. 
_np.int8:
-.. _np.int16:
.. _np.int32:
.. _np.int64:
.. _np.uint8:
diff --git a/docs/cudf/source/basics/internals.rst b/docs/cudf/source/basics/internals.rst
index 60b63c6fab8..96ef40d51e6 100644
--- a/docs/cudf/source/basics/internals.rst
+++ b/docs/cudf/source/basics/internals.rst
@@ -54,7 +54,7 @@ As another example, the ``StringColumn`` backing the Series
 2. No mask buffer as there are no nulls in the Series
 3. Two children columns:
 
-    - A column of 8-bit characters
+    - A column of UTF-8 characters
      ``['d', 'o', 'y', 'o', 'u', 'h' ... '?']``
    - A column of "offsets" to the characters column (in this case,
      ``[0, 2, 5, 9, 12, 19]``)
@@ -172,7 +172,7 @@ Selecting columns by index:
 >>> ca.select_by_index(1)
 ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,))
 >>> ca.select_by_index([0, 1])
-    ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,))
+    ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,))
 >>> ca.select_by_index(slice(1, 3))
 ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,))
diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst
index 71c114e9149..5ff07ac29c5 100644
--- a/docs/cudf/source/basics/io-gds-integration.rst
+++ b/docs/cudf/source/basics/io-gds-integration.rst
@@ -1,14 +1,14 @@
 GPUDirect Storage Integration
 =============================
 
-Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations.
-GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU.
-GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer.
+Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations.
+GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU.
+GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer.
 The SDK is available for download `here `_.
 GDS is also included in CUDA Toolkit 11.4 and higher.
 
-Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``.
-This variable also controls the GDS compatibility mode.
+Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``.
+This variable also controls the GDS compatibility mode.
 There are three valid values for the environment variable:
 
@@ -20,17 +20,17 @@ If no value is set, behavior will be the same as the "GDS" option.
 
 This environment variable also affects how cuDF treats GDS errors.
 When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers.
-When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on),
+When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on),
 cuDF throws an exception to propagate the error to the user.
Operations that support the use of GPUDirect Storage: -- `read_avro` -- `read_parquet` -- `read_orc` -- `to_csv` -- `to_parquet` -- `to_orc` +- :py:func:`cudf.read_avro` +- :py:func:`cudf.read_parquet` +- :py:func:`cudf.read_orc` +- :py:meth:`cudf.DataFrame.to_csv` +- :py:meth:`cudf.DataFrame.to_parquet` +- :py:meth:`cudf.DataFrame.to_orc` Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables: diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst index 521833e2afd..fc24e0c15f4 100644 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -1,14 +1,14 @@ nvCOMP Integration ============================= -Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. +Some types of compression/decompression can be performed using either the `nvCOMP library `_ or the internal implementation. Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. There are three valid values for the environment variable: -- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. +- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. - "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. - "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7c585602c23..915383e4852 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + from collections.abc import Sequence import cupy @@ -21,21 +23,27 @@ def cut( duplicates: str = "raise", ordered: bool = True, ): + """Bin values into discrete intervals. - """ - Bin values into discrete intervals. Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. + Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. - * int : Defines the number of equal-width bins in the - range of x. The range of x is extended by .1% on each - side to include the minimum and maximum values of x. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. + right : bool, default True Indicates whether bins includes the rightmost edge or not. labels : array or False, default None @@ -66,30 +74,38 @@ def cut( For scalar or sequence bins, this is an ndarray with the computed bins. If set duplicates=drop, bins will drop non-unique bin. For an IntervalIndex bins, this is equal to bins. + Examples -------- Discretize into three equal-sized bins. + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3) CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... 
(5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'), - array([0.994, 3. , 5. , 7. ])) + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category'), + array([0.994, 3. , 5. , 7. ])) + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) + ... 3, labels=["bad", "medium", "good"]) CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'], - ... categories=['bad', 'medium', 'good'],ordered=True, - ... dtype='category') + categories=['bad', 'medium', 'good'],ordered=True, + dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) + ... labels=["B", "A", "B"], ordered=False) CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'], - ... ordered=False, dtype='category') + ordered=False, dtype='category') + >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False) array([0, 1, 1, 3], dtype=int32) + Passing a Series as an input returns a Series with categorical dtype: + >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> cudf.cut(s, 3) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6b98e82d553..40f8eda0e4f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -566,19 +566,20 @@ def mult(df): .. code-block:: >>> df = pd.DataFrame({ - 'a': [1, 1, 2, 2], - 'b': [1, 2, 1, 2], - 'c': [1, 2, 3, 4]}) + ... 'a': [1, 1, 2, 2], + ... 'b': [1, 2, 1, 2], + ... 'c': [1, 2, 3, 4], + ... }) >>> gdf = cudf.from_pandas(df) >>> df.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - a - 1 0 1 1 1 - 2 2 2 1 3 + a b c + a + 1 0 1 1 1 + 2 2 2 1 3 >>> gdf.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - 0 1 1 1 - 2 2 1 3 + a b c + 0 1 1 1 + 2 2 1 3 """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index addc823e7f1..7fa66bd831d 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -81,8 +81,8 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def ndim(self): - """Get the dimensionality (always 1 for single-columned frames).""" + def ndim(self): # noqa: D401 + """Number of dimensions of the underlying data, by definition 1.""" return 1 @property # type: ignore From 6e6c325e7cb99baeecaec65aff8c97aa2450ff51 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 14 Apr 2022 18:58:48 -0500 Subject: [PATCH 03/26] Fix some docstrings formatting (#10660) This PR fixes some of the broken docstring formattings in the code-base. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10660 --- docs/cudf/source/api_docs/dataframe.rst | 3 +++ docs/cudf/source/api_docs/index_objects.rst | 2 ++ docs/cudf/source/api_docs/series.rst | 2 ++ docs/cudf/source/api_docs/string_handling.rst | 1 - docs/cudf/source/conf.py | 1 + python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/cut.py | 1 + python/cudf/cudf/core/indexed_frame.py | 2 ++ python/cudf/cudf/core/tools/numeric.py | 2 +- 9 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 1d600acfef1..e0ef3cb2ff0 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -149,6 +149,7 @@ Computations / descriptive stats DataFrame.round DataFrame.skew DataFrame.sum + DataFrame.sum_of_squares DataFrame.std DataFrame.var DataFrame.nunique @@ -248,9 +249,11 @@ Serialization / IO / conversion DataFrame.to_dlpack DataFrame.to_parquet DataFrame.to_csv + DataFrame.to_cupy DataFrame.to_hdf DataFrame.to_dict DataFrame.to_json + DataFrame.to_numpy DataFrame.to_pandas DataFrame.to_feather DataFrame.to_records diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 6f5affd0ecd..8e0e3bbd411 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -92,7 +92,9 @@ Conversion Index.astype Index.to_arrow + Index.to_cupy Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 95aa71919e4..d7015c9348d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -390,10 +390,12 @@ Serialization / IO / conversion :toctree: api/ Series.to_arrow + Series.to_cupy Series.to_dlpack Series.to_frame Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst index 3087bcaa826..8d4646c47a7 100644 --- a/docs/cudf/source/api_docs/string_handling.rst +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -83,7 +83,6 @@ strings and apply several methods to it. These can be accessed like rsplit startswith strip - subword_tokenize swapcase title token_count diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index dbdf8e59e6a..d65b77ef74b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -252,6 +252,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): lines[:] = lines[:cut_index] +nitpick_ignore = [("py:class", "SeriesOrIndex"),] def setup(app): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 259a7f711c3..6fed6510484 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -118,7 +118,7 @@ def get_level_values(self, level): See Also -------- - cudf.core.multiindex.MultiIndex.get_level_values : Get values for + cudf.MultiIndex.get_level_values : Get values for a level of a MultiIndex. 
Notes diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 915383e4852..0fef6630248 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -64,6 +64,7 @@ def cut( Categorical and Series (with Categorical dtype). If True, the resulting categorical will be ordered. If False, the resulting categorical will be unordered (labels must be provided). + Returns ------- out : CategoricalIndex diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10736948b57..ea722ec3968 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -991,6 +991,7 @@ def add_prefix(self, prefix): Examples -------- **Series** + >>> s = cudf.Series([1, 2, 3, 4]) >>> s 0 1 @@ -1006,6 +1007,7 @@ def add_prefix(self, prefix): dtype: int64 **DataFrame** + >>> df = cudf.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) >>> df A B diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 7eea7cedaad..0273227010b 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -57,7 +57,7 @@ def to_numeric(arg, errors="raise", downcast=None): otherwise ndarray Notes - ------- + ----- An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. A ``TypeError`` will be raised when such input is received, regardless of From 8f5a04451f8f61015d08c5699f0427b550afb53b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 14 Apr 2022 17:24:37 -0700 Subject: [PATCH 04/26] Add option to drop cache in cuIO benchmarks (#10488) Dropping cache allows us to benchmark I/O times in a realistic/fair way. Cache is dropped before each iteration if `CUDF_BENCHMARK_DROP_CACHE` environment variable is set. 
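In rough terms, the change gates each timed iteration on an opt-in environment variable. A simplified sketch of the mechanism follows (the actual implementation, in the `cpp/benchmarks/io/cuio_common.cpp` hunk below, shells out via `popen` and verifies that the drop-cache command produced no error output):

```cpp
#include <cstdlib>  // std::getenv, std::system

// Simplified sketch only; see the cuio_common.cpp diff below for the real code.
void try_drop_l3_cache_sketch()
{
  // Opt-in: do nothing unless CUDF_BENCHMARK_DROP_CACHE is set.
  static bool const enabled = std::getenv("CUDF_BENCHMARK_DROP_CACHE") != nullptr;
  if (!enabled) { return; }

  // vm.drop_caches=3 flushes the page cache plus dentries/inodes, so each
  // iteration measures real storage I/O rather than reads from RAM.
  // Typically requires root or passwordless sudo.
  std::system("sudo /sbin/sysctl vm.drop_caches=3");
}
```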
Authors:
 - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
 - Mike Wilson (https://github.com/hyperbolic2346)
 - MithunR (https://github.com/mythrocks)
 - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/10488
---
 cpp/benchmarks/io/csv/csv_reader.cpp         |  2 ++
 cpp/benchmarks/io/cuio_common.cpp            | 28 ++++++++++++++++++++
 cpp/benchmarks/io/cuio_common.hpp            | 10 +++++++
 cpp/benchmarks/io/orc/orc_reader.cpp         |  2 ++
 cpp/benchmarks/io/parquet/parquet_reader.cpp |  2 ++
 cpp/benchmarks/io/text/multibyte_split.cpp   |  1 +
 6 files changed, 45 insertions(+)

diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp
index c50f5220200..6f5e7160cd3 100644
--- a/cpp/benchmarks/io/csv/csv_reader.cpp
+++ b/cpp/benchmarks/io/csv/csv_reader.cpp
@@ -52,6 +52,7 @@ void BM_csv_read_varying_input(benchmark::State& state)
   auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
+    try_drop_l3_cache();
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     cudf_io::read_csv(read_options);
   }
@@ -98,6 +99,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
   cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
   auto mem_stats_logger = cudf::memory_stats_logger();
   for (auto _ : state) {
+    try_drop_l3_cache();
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
       // only read the header in the first chunk
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index afe0cc77a4c..7d356263220 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -141,3 +141,31 @@ std::vector segments_in_chunk(int num_segments, int num_chunks,
   return selected_segments;
 }
+
+// Executes the command and returns stderr output
+std::string exec_cmd(std::string_view cmd)
+{
+  // Switch stderr and stdout to only capture stderr
+  auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
+  std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose);
+  CUDF_EXPECTS(pipe != nullptr, "popen() failed");
+
+  std::array buffer;
+  std::string error_out;
+  while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
+    error_out += buffer.data();
+  }
+  return error_out;
+}
+
+void try_drop_l3_cache()
+{
+  static bool is_drop_cache_enabled = std::getenv("CUDF_BENCHMARK_DROP_CACHE") != nullptr;
+  if (not is_drop_cache_enabled) { return; }
+
+  std::array drop_cache_cmds{"/sbin/sysctl vm.drop_caches=3", "sudo /sbin/sysctl vm.drop_caches=3"};
+  CUDF_EXPECTS(std::any_of(drop_cache_cmds.cbegin(),
+                           drop_cache_cmds.cend(),
+                           [](auto& cmd) { return exec_cmd(cmd).empty(); }),
+               "Failed to execute the drop cache command");
+}
diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp
index 2ed534d5333..ff900d20e6f 100644
--- a/cpp/benchmarks/io/cuio_common.hpp
+++ b/cpp/benchmarks/io/cuio_common.hpp
@@ -132,3 +132,13 @@ std::vector select_column_names(std::vector const& col
 * The segments could be Parquet row groups or ORC stripes.
 */
std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk);
+
+/**
+ * @brief Drops L3 cache if `CUDF_BENCHMARK_DROP_CACHE` environment variable is set.
+ *
+ * Has no effect if the environment variable is not set.
+ * May require sudo access to run successfully.
+ * + * @throw cudf::logic_error if the environment variable is set and the command fails + */ +void try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index 0fc2238a272..fc76fbe7603 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -60,6 +60,7 @@ void BM_orc_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_orc(read_opts); } @@ -117,6 +118,7 @@ void BM_orc_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf::size_type rows_read = 0; diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index 8a97fd35c31..b20534e8ac0 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -60,6 +60,7 @@ void BM_parq_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer const raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_parquet(read_opts); } @@ -117,6 +118,7 @@ void BM_parq_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf::size_type rows_read = 0; diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index ada8856e8e5..af6c2c5e030 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -137,6 +137,7 @@ static void BM_multibyte_split(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); auto output = cudf::io::text::multibyte_split(*source, delim); } From b542678fda6ea40544d42e759caf3a6f8ad2b44d Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 15 Apr 2022 09:59:51 -0400 Subject: [PATCH 05/26] cuco isn't a cudf dependency when we are built shared (#10662) With the corrections in https://github.com/rapidsai/cudf/pull/10545 we didn't install the cuco headers / cmake files as they aren't needed for shared builds. But we forgot to remove the `find_package(cuco)` call from the generated cudf-config.cmake. 
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Paul Taylor (https://github.com/trxcllnt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10662 --- cpp/cmake/thirdparty/get_cucollections.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 1639655d1e9..5232821d113 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,12 +21,14 @@ function(find_and_configure_cucollections) cuco 0.0.1 GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5 EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS} OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) + if(NOT BUILD_SHARED_LIBS) + rapids_export_package(INSTALL cuco cudf-exports) + endif() endfunction() From 4e668f27ba741ec1065b6ae6f99c0a4608df4336 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 15 Apr 2022 09:40:49 -0500 Subject: [PATCH 06/26] Update UDF notebook in User Guide. (#10668) I noticed a couple lines I didn't expect in the UDF notebook in the User Guide while working on #10663. I didn't get these changes into that PR (had to wait for a local build to verify some things). The two changes are: - We don't require `method="cudf"` in groupby statements. - We don't need to execute `from cudf.utils import cudautils` to run this notebook. (The cell execution counts also changed. There were some cells executed multiple times the last time this notebook was executed so they got out of order - this fixes it.) 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/10668 --- .../source/user_guide/guide-to-udfs.ipynb | 152 +++++++++--------- 1 file changed, 75 insertions(+), 77 deletions(-) diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 41bce8b865e..0d05ddb00b4 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -138,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -160,7 +160,7 @@ "dtype: int64" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -205,7 +205,7 @@ "dtype: int64" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -218,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -229,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -241,7 +241,7 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -286,7 +286,7 @@ "dtype: int64" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -322,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -452,7 +452,7 @@ "4 979 982 1011 9790.0" ] }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -497,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -569,7 +569,7 @@ "2 3 6" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -591,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -603,7 +603,7 @@ "dtype: int64" ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -621,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -633,7 +633,7 @@ "dtype: object" ] }, - 
"execution_count": 23, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -658,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -709,7 +709,7 @@ "2 3" ] }, - "execution_count": 24, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -740,7 +740,7 @@ "dtype: int64" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -758,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -813,7 +813,7 @@ "2 3 1" ] }, - "execution_count": 26, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -836,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -848,7 +848,7 @@ "dtype: int64" ] }, - "execution_count": 27, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -866,7 +866,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -921,7 +921,7 @@ "2 3 3.14" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -939,7 +939,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -951,7 +951,7 @@ "dtype: float64" ] }, - "execution_count": 29, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -982,7 +982,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1033,7 +1033,7 @@ "2 5" ] }, - "execution_count": 30, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1054,7 +1054,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1066,7 +1066,7 @@ "dtype: float64" ] }, - "execution_count": 31, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1084,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1151,7 +1151,7 @@ "2 3 6 4 8 6" ] }, - "execution_count": 32, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1172,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1184,7 +1184,7 @@ "dtype: float64" ] }, - "execution_count": 33, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1212,7 +1212,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1241,7 +1241,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1312,7 +1312,7 @@ "2 3 6 4 8 6 9.0" ] }, - "execution_count": 35, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1344,7 +1344,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1417,7 +1417,7 @@ "4 979 982 1011" ] }, - "execution_count": 36, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1443,7 +1443,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 33, 
"metadata": {}, "outputs": [ { @@ -1522,7 +1522,7 @@ "4 979 982 1011 1961.0" ] }, - "execution_count": 37, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1555,7 +1555,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1570,7 +1570,7 @@ "dtype: float64" ] }, - "execution_count": 38, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1582,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1591,7 +1591,7 @@ "Rolling [window=3,min_periods=3,center=False]" ] }, - "execution_count": 39, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1610,7 +1610,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1634,7 +1634,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1649,7 +1649,7 @@ "dtype: float64" ] }, - "execution_count": 41, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1667,7 +1667,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1734,7 +1734,7 @@ "4 59.0 59.0" ] }, - "execution_count": 42, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1748,7 +1748,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1845,7 +1845,7 @@ "9 100.0 100.0" ] }, - "execution_count": 43, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1863,12 +1863,12 @@ "\n", "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", "\n", - "First, we'll group our DataFrame based on column `b`, which is either True or False. Note that we currently need to pass `method=\"cudf\"` to use UDFs with GroupBy objects." + "First, we'll group our DataFrame based on column `b`, which is either True or False." 
] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1947,7 +1947,7 @@ "4 -0.970850 False Sarah 0.342905" ] }, - "execution_count": 44, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1959,7 +1959,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1975,7 +1975,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -2002,7 +2002,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -2132,7 +2132,7 @@ "9 -0.725581 True George 0.405245 0.271319" ] }, - "execution_count": 47, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -2162,7 +2162,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -2171,7 +2171,7 @@ "array([ 1., 2., 3., 4., 10.])" ] }, - "execution_count": 48, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -2193,7 +2193,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -2207,14 +2207,12 @@ "dtype: int32" ] }, - "execution_count": 49, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from cudf.utils import cudautils\n", - "\n", "@cuda.jit\n", "def multiply_by_5(x, out):\n", " i = cuda.grid(1)\n", @@ -2235,7 +2233,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -2244,7 +2242,7 @@ "array([ 5., 10., 15., 20., 50.])" ] }, - "execution_count": 50, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -2307,7 +2305,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.12" } }, "nbformat": 4, From 9e1258de32422f6f36e54bd3a2085a3c9c517a66 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 15 Apr 2022 12:13:45 -0700 Subject: [PATCH 07/26] Use `std::filesystem` for temporary directory location and deletion (#10664) Addressing a long-standing TODO. Since std::filesystem is available since C++17, use it to recursively delete temporary directories (used for benchmarks, etc.). Small step towards portable temp directory/file utilities. 
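The core of the change is the C++17 idiom sketched below; `scoped_temp_dir` is an illustrative stand-in for the `temp_directory` helper touched in the diff that follows, with error handling reduced to a bare throw:

```cpp
#include <cstdlib>     // mkdtemp (POSIX)
#include <filesystem>
#include <stdexcept>
#include <string>

// Create a unique temporary directory under the standard temp location and
// recursively delete it on destruction; std::filesystem::remove_all replaces
// the old nftw()-based file walk.
class scoped_temp_dir {
  std::string _path;

 public:
  scoped_temp_dir()
  {
    std::string tmpl = std::filesystem::temp_directory_path().string() + "/demo.XXXXXX";
    auto* created    = mkdtemp(tmpl.data());  // fills in the XXXXXX placeholder
    if (created == nullptr) { throw std::runtime_error("mkdtemp failed"); }
    _path = created;
  }
  ~scoped_temp_dir() { std::filesystem::remove_all(_path); }

  [[nodiscard]] std::string const& path() const { return _path; }
};
```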
Authors:
 - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
 - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
 - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/10664
---
 cpp/benchmarks/text/subword.cpp          |  3 ++-
 cpp/include/cudf_test/file_utilities.hpp | 20 +++++++-------------
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp
index 150f578a22a..b8311324f70 100644
--- a/cpp/benchmarks/text/subword.cpp
+++ b/cpp/benchmarks/text/subword.cpp
@@ -21,6 +21,7 @@
 #include
 #include
+#include <filesystem>
 #include
 #include
 #include
@@ -29,7 +30,7 @@
 static std::string create_hash_vocab_file()
 {
-  std::string dir_template("/tmp");
+  std::string dir_template{std::filesystem::temp_directory_path().string()};
   if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
   std::string hash_file = dir_template + "/hash_vocab.txt";
   // create a fake hashed vocab text file for this test
diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp
index 4df7b6a69c8..d722b836674 100644
--- a/cpp/include/cudf_test/file_utilities.hpp
+++ b/cpp/include/cudf_test/file_utilities.hpp
@@ -18,6 +18,7 @@
 #include
 #include
+#include <filesystem>
 #include
 #include
@@ -34,17 +35,14 @@ class temp_directory {
  public:
   temp_directory(const std::string& base_name)
   {
-    std::string dir_template("/tmp");
-    if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
+    std::string dir_template{std::filesystem::temp_directory_path().string()};
+    if (auto env_p = std::getenv("WORKSPACE")) dir_template = env_p;
+
     dir_template += "/" + base_name + ".XXXXXX";
     auto const tmpdirptr = mkdtemp(const_cast<char*>(dir_template.data()));
-    if (tmpdirptr == nullptr) CUDF_FAIL("Temporary directory creation failure: " + dir_template);
-    _path = dir_template + "/";
-  }
+    CUDF_EXPECTS(tmpdirptr != nullptr, "Temporary directory creation failure: " + dir_template);
 
-  static int rm_files(const char* pathname, const struct stat* sbuf, int type, struct FTW* ftwb)
-  {
-    return std::remove(pathname);
+    _path = dir_template + "/";
   }
 
   temp_directory& operator=(temp_directory const&) = delete;
   temp_directory& operator=(temp_directory&&) = default;
   temp_directory(temp_directory&&) = default;
 
-  ~temp_directory()
-  {
-    // TODO: should use std::filesystem instead, once C++17 support added
-    nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
-  }
+  ~temp_directory() { std::filesystem::remove_all(std::filesystem::path{_path}); }
 
   /**
    * @brief Returns the path of the temporary directory

From d5a982b621dc67b1a9db6292abcc4e72e4693b36 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Fri, 15 Apr 2022 16:00:13 -0400
Subject: [PATCH 08/26] Add column field ID control in parquet writer (#10504)

Closes https://github.com/rapidsai/cudf/issues/10375
Closes https://github.com/rapidsai/cudf/issues/10376

This PR enables column `field_id` control in the parquet writer. When writing a parquet file, users can specify a column's `field_id` via `column_in_metadata.set_parquet_field_id()`. JNI bindings and unit tests are added as well.
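For context, the new option is set per column through the writer's input metadata. A hypothetical snippet follows (the column name and id are invented, and the surrounding `table_input_metadata` plumbing is summarized from the existing cuDF API rather than taken from this diff):

```cpp
#include <cudf/io/types.hpp>

// set_parquet_field_id() returns *this, so it chains with the existing
// column_in_metadata setters such as set_name().
void tag_first_column(cudf::io::table_input_metadata& metadata)
{
  metadata.column_metadata[0]
      .set_name("sale_id")        // pre-existing setter; name is invented
      .set_parquet_field_id(42);  // new in this PR; 42 is an arbitrary id
}
```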
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jason Lowe (https://github.com/jlowe) - Vukasin Milovanovic (https://github.com/vuule) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/10504 --- cpp/include/cudf/io/types.hpp | 27 +++- .../io/parquet/compact_protocol_reader.cpp | 1 + .../io/parquet/compact_protocol_reader.hpp | 24 ++++ .../io/parquet/compact_protocol_writer.cpp | 1 + cpp/src/io/parquet/parquet.hpp | 6 +- cpp/src/io/parquet/writer_impl.cu | 15 ++- cpp/tests/io/parquet_test.cpp | 10 +- .../ai/rapids/cudf/ColumnWriterOptions.java | 119 +++++++++++++++-- .../CompressionMetadataWriterOptions.java | 10 ++ java/src/main/java/ai/rapids/cudf/Table.java | 15 ++- java/src/main/native/src/TableJni.cpp | 46 +++++-- .../test/java/ai/rapids/cudf/TableTest.java | 121 ++++++++++++++++++ 12 files changed, 368 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 7e4ab5b8d9d..23ed0153f3f 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -244,6 +244,7 @@ class column_in_metadata { bool _use_int96_timestamp = false; // bool _output_as_binary = false; thrust::optional _decimal_precision; + thrust::optional _parquet_field_id; std::vector children; public: @@ -324,6 +325,18 @@ class column_in_metadata { return *this; } + /** + * @brief Set the parquet field id of this column. + * + * @param field_id The parquet field id to set + * @return this for chaining + */ + column_in_metadata& set_parquet_field_id(int32_t field_id) + { + _parquet_field_id = field_id; + return *this; + } + /** * @brief Get reference to a child of this column * @@ -379,6 +392,18 @@ class column_in_metadata { */ [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + /** + * @brief Get whether parquet field id has been set for this column. + */ + [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); } + + /** + * @brief Get the parquet field id that was set for this column. + * @throws If parquet field id was not set for this column. + * Check using `is_parquet_field_id_set()` first. 
+ */ + [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); } + /** * @brief Get the number of children of this column */ diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 7feaa8e61b4..a1fc2edb0bb 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -156,6 +156,7 @@ bool CompactProtocolReader::read(SchemaElement* s) ParquetFieldEnum(6, s->converted_type), ParquetFieldInt32(7, s->decimal_scale), ParquetFieldInt32(8, s->decimal_precision), + ParquetFieldOptionalInt32(9, s->field_id), ParquetFieldStruct(10, s->logical_type)); return function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index ba48f7b127f..ddca6c37e08 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -18,6 +18,8 @@ #include "parquet.hpp" +#include + #include #include #include @@ -137,6 +139,7 @@ class CompactProtocolReader { friend class ParquetFieldBool; friend class ParquetFieldInt8; friend class ParquetFieldInt32; + friend class ParquetFieldOptionalInt32; friend class ParquetFieldInt64; template friend class ParquetFieldStructListFunctor; @@ -216,6 +219,27 @@ class ParquetFieldInt32 { int field() { return field_val; } }; +/** + * @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader + * + * @return True if field type is not int32 + */ +class ParquetFieldOptionalInt32 { + int field_val; + thrust::optional& val; + + public: + ParquetFieldOptionalInt32(int f, thrust::optional& v) : field_val(f), val(v) {} + + inline bool operator()(CompactProtocolReader* cpr, int field_type) + { + val = cpr->get_i32(); + return (field_type != ST_FLD_I32); + } + + int field() { return field_val; } +}; + /** * @brief Functor to set value to 64 bit integer read from CompactProtocolReader * diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 927844cb1c2..176ecb6a572 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -144,6 +144,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s) c.field_int(8, s.decimal_precision); } } + if (s.field_id) { c.field_int(9, s.field_id.value()); } auto const isset = s.logical_type.isset; // TODO: add handling for all logical types // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index b1800640c91..ccaf3485bdf 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include #include @@ -145,6 +147,7 @@ struct SchemaElement { int32_t num_children = 0; int32_t decimal_scale = 0; int32_t decimal_precision = 0; + thrust::optional field_id = thrust::nullopt; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -157,7 +160,8 @@ struct SchemaElement { return type == other.type && converted_type == other.converted_type && type_length == other.type_length && repetition_type == other.repetition_type && name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision; + decimal_scale == other.decimal_scale && 
decimal_precision == other.decimal_precision && + field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index cb1acb4d9ec..4bc084c61d0 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -491,6 +491,13 @@ std::vector<schema_tree_node> construct_schema_tree( [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { bool col_nullable = is_col_nullable(col, col_meta, single_write_mode); + auto set_field_id = [&schema, parent_idx](schema_tree_node& s, + column_in_metadata const& col_meta) { + if (schema[parent_idx].name != "list" and col_meta.is_parquet_field_id_set()) { + s.field_id = col_meta.get_parquet_field_id(); + } + }; + if (col->type().id() == type_id::STRUCT) { // if struct, add current and recursively call for all children schema_tree_node struct_schema{}; @@ -500,6 +507,7 @@ std::vector<schema_tree_node> construct_schema_tree( struct_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); struct_schema.num_children = col->children.size(); struct_schema.parent_idx = parent_idx; + set_field_id(struct_schema, col_meta); schema.push_back(std::move(struct_schema)); auto struct_node_index = schema.size() - 1; @@ -524,6 +532,7 @@ std::vector<schema_tree_node> construct_schema_tree( list_schema_1.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); list_schema_1.num_children = 1; list_schema_1.parent_idx = parent_idx; + set_field_id(list_schema_1, col_meta); schema.push_back(std::move(list_schema_1)); schema_tree_node list_schema_2{}; @@ -555,7 +564,10 @@ std::vector<schema_tree_node> construct_schema_tree( map_schema.converted_type = ConvertedType::MAP; map_schema.repetition_type = col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED; - map_schema.name = col_meta.get_name(); + map_schema.name = col_meta.get_name(); + if (col_meta.is_parquet_field_id_set()) { + map_schema.field_id = col_meta.get_parquet_field_id(); + } map_schema.num_children = 1; map_schema.parent_idx = parent_idx; schema.push_back(std::move(map_schema)); @@ -612,6 +624,7 @@ std::vector<schema_tree_node> construct_schema_tree( col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); col_schema.parent_idx = parent_idx; col_schema.leaf_column = col; + set_field_id(col_schema, col_meta); schema.push_back(col_schema); } }; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index cd0aab3caeb..3905df2b274 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -219,15 +219,21 @@ struct ParquetWriterTimestampTypeTest : public ParquetWriterTest { auto type() { return cudf::data_type{cudf::type_to_id<T>()}; } }; +// Typed test fixture for all types +template <typename T> +struct ParquetWriterSchemaTest : public ParquetWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id<T>()}; } +}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5352 using SupportedTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes); -using SupportedChronoTypes = cudf::test::Concat; -TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, SupportedChronoTypes); +TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, cudf::test::ChronoTypes); using SupportedTimestampTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); +TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes); // Base test fixture for chunked writer tests struct ParquetChunkedWriterTest : public cudf::test::BaseFixture { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 78b3d5d52ec..f3fb7de6abe 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -33,9 +33,15 @@ public class ColumnWriterOptions { private boolean isNullable; private boolean isMap = false; private String columnName; + // only for Parquet + private boolean hasParquetFieldId; + private int parquetFieldId; + private ColumnWriterOptions(AbstractStructBuilder builder) { this.columnName = builder.name; this.isNullable = builder.isNullable; + this.hasParquetFieldId = builder.hasParquetFieldId; + this.parquetFieldId = builder.parquetFieldId; this.childColumnOptions = (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } @@ -67,6 +73,10 @@ public AbstractStructBuilder(String name, boolean isNullable) { super(name, isNullable); } + public AbstractStructBuilder(String name, boolean isNullable, int parquetFieldId) { + super(name, isNullable, parquetFieldId); + } + protected AbstractStructBuilder() { super(); } @@ -84,6 +94,9 @@ public static abstract class NestedBuilder children = new ArrayList<>(); protected boolean isNullable = true; protected String name = ""; + // Parquet structure needs + protected boolean hasParquetFieldId; + protected int parquetFieldId; /** * Builder specific to build a Struct meta @@ -93,22 +106,43 @@ protected NestedBuilder(String name, boolean isNullable) { this.isNullable = isNullable; } + protected NestedBuilder(String name, boolean isNullable, int parquetFieldId) { + this.name = name; + this.isNullable = isNullable; + this.hasParquetFieldId = true; + this.parquetFieldId = parquetFieldId; + } + protected NestedBuilder() {} - protected ColumnWriterOptions withColumns(String name, boolean isNullable) { + protected ColumnWriterOptions withColumn(String name, boolean isNullable) { return new ColumnWriterOptions(name, isNullable); } + protected ColumnWriterOptions withColumn(String name, boolean isNullable, int parquetFieldId) { + return new ColumnWriterOptions(name, isNullable, parquetFieldId); + } + protected ColumnWriterOptions withDecimal(String name, int precision, boolean isNullable) { return new ColumnWriterOptions(name, false, precision, isNullable); } + protected ColumnWriterOptions withDecimal(String name, int precision, + boolean isNullable, int parquetFieldId) { + return new ColumnWriterOptions(name, false, precision, isNullable, parquetFieldId); + } + protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } + protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, + boolean isNullable, int parquetFieldId) { + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, 
isNullable, parquetFieldId); + } + /** * Set the list column meta. * Lists should have only one child in ColumnVector, but the metadata expects a @@ -155,16 +189,16 @@ public T withStructColumn(StructColumnWriterOptions child) { /** * Set column name */ - public T withNonNullableColumns(String... name) { - withColumns(false, name); + public T withNonNullableColumns(String... names) { + withColumns(false, names); return (T) this; } /** * Set nullable column meta data */ - public T withNullableColumns(String... name) { - withColumns(true, name); + public T withNullableColumns(String... names) { + withColumns(true, names); return (T) this; } @@ -172,13 +206,22 @@ public T withNullableColumns(String... name) { * Set a simple child meta data * @return this for chaining. */ - public T withColumns(boolean nullable, String... name) { - for (String n : name) { - children.add(withColumns(n, nullable)); + public T withColumns(boolean nullable, String... names) { + for (String n : names) { + children.add(withColumn(n, nullable)); } return (T) this; } + /** + * Set a simple child meta data + * @return this for chaining. + */ + public T withColumn(boolean nullable, String name, int parquetFieldId) { + children.add(withColumn(name, nullable, parquetFieldId)); + return (T) this; + } + /** * Set a Decimal child meta data * @return this for chaining. @@ -188,6 +231,15 @@ public T withDecimalColumn(String name, int precision, boolean nullable) { return (T) this; } + /** + * Set a Decimal child meta data + * @return this for chaining. + */ + public T withDecimalColumn(String name, int precision, boolean nullable, int parquetFieldId) { + children.add(withDecimal(name, precision, nullable, parquetFieldId)); + return (T) this; + } + /** * Set a Decimal child meta data * @return this for chaining. @@ -206,6 +258,15 @@ public T withDecimalColumn(String name, int precision) { return (T) this; } + /** + * Set a timestamp child meta data + * @return this for chaining. + */ + public T withTimestampColumn(String name, boolean isInt96, boolean nullable, int parquetFieldId) { + children.add(withTimestamp(name, isInt96, nullable, parquetFieldId)); + return (T) this; + } + /** * Set a timestamp child meta data * @return this for chaining. 
@@ -244,6 +305,13 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, this.columnName = columnName; } + public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, + int precision, boolean isNullable, int parquetFieldId) { + this(columnName, isTimestampTypeInt96, precision, isNullable); + this.hasParquetFieldId = true; + this.parquetFieldId = parquetFieldId; + } + public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; this.precision = UNKNOWN_PRECISION; @@ -251,6 +319,12 @@ public ColumnWriterOptions(String columnName, boolean isNullable) { this.columnName = columnName; } + public ColumnWriterOptions(String columnName, boolean isNullable, int parquetFieldId) { + this(columnName, isNullable); + this.hasParquetFieldId = true; + this.parquetFieldId = parquetFieldId; + } + public ColumnWriterOptions(String columnName) { this(columnName, true); } @@ -302,6 +376,24 @@ int[] getFlatPrecision() { } } + boolean[] getFlatHasParquetFieldId() { + boolean[] ret = {hasParquetFieldId}; + if (childColumnOptions.length > 0) { + return getFlatBooleans(ret, (opt) -> opt.getFlatHasParquetFieldId()); + } else { + return ret; + } + } + + int[] getFlatParquetFieldId() { + int[] ret = {parquetFieldId}; + if (childColumnOptions.length > 0) { + return getFlatInts(ret, (opt) -> opt.getFlatParquetFieldId()); + } else { + return ret; + } + } + boolean[] getFlatIsNullable() { boolean[] ret = {isNullable}; if (childColumnOptions.length > 0) { @@ -418,6 +510,13 @@ public static StructBuilder structBuilder(String name, boolean isNullable) { return new StructBuilder(name, isNullable); } + /** + * Creates a StructBuilder for column called 'name' + */ + public static StructBuilder structBuilder(String name, boolean isNullable, int parquetFieldId) { + return new StructBuilder(name, isNullable, parquetFieldId); + } + /** * Creates a StructBuilder for column called 'name' */ @@ -477,6 +576,10 @@ public StructBuilder(String name, boolean isNullable) { super(name, isNullable); } + public StructBuilder(String name, boolean isNullable, int parquetFieldId) { + super(name, isNullable, parquetFieldId); + } + public StructColumnWriterOptions build() { return new StructColumnWriterOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index 9292975d0ce..3a3b7d721b7 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -41,6 +41,16 @@ int[] getFlatPrecision() { return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatPrecision()); } + @Override + boolean[] getFlatHasParquetFieldId() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatHasParquetFieldId()); + } + + @Override + int[] getFlatParquetFieldId() { + return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatParquetFieldId()); + } + @Override int[] getFlatNumChildren() { return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatNumChildren()); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index ff966643866..24f7d44ed28 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -289,7 +289,10 @@ private static native long writeParquetFileBegin(String[] columnNames, int statsFreq, boolean[] isInt96, int[] precisions, - boolean[] 
isMapValues, String filename) throws CudfException; + boolean[] isMapValues, + boolean[] hasParquetFieldIds, + int[] parquetFieldIds, + String filename) throws CudfException; /** * Setup everything to write parquet formatted data to a buffer. @@ -319,6 +322,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, boolean[] isInt96, int[] precisions, boolean[] isMapValues, + boolean[] hasParquetFieldIds, + int[] parquetFieldIds, HostBufferConsumer consumer) throws CudfException; /** @@ -1201,6 +1206,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); + boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId(); + int[] parquetFieldIds = options.getFlatParquetFieldId(); int[] flatNumChildren = options.getFlatNumChildren(); this.consumer = null; @@ -1215,6 +1222,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { timeInt96Values, precisions, isMapValues, + hasParquetFieldIds, + parquetFieldIds, outputFile.getAbsolutePath()); } @@ -1224,6 +1233,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); + boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId(); + int[] parquetFieldIds = options.getFlatParquetFieldId(); int[] flatNumChildren = options.getFlatNumChildren(); this.consumer = consumer; @@ -1238,6 +1249,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons timeInt96Values, precisions, isMapValues, + hasParquetFieldIds, + parquetFieldIds, consumer); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index cebe476dd87..919958d4db2 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -673,6 +673,8 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, cudf::jni::native_jbooleanArray &is_int96, cudf::jni::native_jintArray &precisions, cudf::jni::native_jbooleanArray &is_map, + cudf::jni::native_jbooleanArray &hasParquetFieldIds, + cudf::jni::native_jintArray &parquetFieldIds, cudf::jni::native_jintArray &children, int num_children, int read_index) { int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { @@ -687,12 +689,15 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, if (is_map[read_index]) { child.set_list_column_as_map(); } + if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) { + child.set_parquet_field_id(parquetFieldIds[read_index]); + } column_metadata.add_child(child); int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = - set_column_metadata(column_metadata.child(write_index), col_names, nullability, is_int96, - precisions, is_map, children, childs_children, read_index); + read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability, + is_int96, precisions, is_map, hasParquetFieldIds, + parquetFieldIds, children, childs_children, read_index); } } return read_index; @@ -701,12 +706,15 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names, jintArray &j_children, jbooleanArray 
&j_col_nullability, jbooleanArray &j_is_int96, jintArray &j_precisions, - jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata) { + jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata, + jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); cudf::jni::native_jbooleanArray is_int96(env, j_is_int96); cudf::jni::native_jintArray precisions(env, j_precisions); + cudf::jni::native_jbooleanArray hasParquetFieldIds(env, j_hasParquetFieldIds); + cudf::jni::native_jintArray parquetFieldIds(env, j_parquetFieldIds); cudf::jni::native_jintArray children(env, j_children); cudf::jni::native_jbooleanArray is_map(env, j_is_map); @@ -729,11 +737,14 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam if (is_map[read_index]) { metadata.column_metadata[write_index].set_list_column_as_map(); } + if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) { + metadata.column_metadata[write_index].set_parquet_field_id(parquetFieldIds[read_index]); + } int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = - set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability, - is_int96, precisions, is_map, children, childs_children, read_index); + read_index = set_column_metadata( + metadata.column_metadata[write_index], cpp_names, col_nullability, is_int96, precisions, + is_map, hasParquetFieldIds, parquetFieldIds, children, childs_children, read_index); } } } @@ -1539,7 +1550,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jbooleanArray j_is_map, jobject consumer) { + jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds, + jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1554,7 +1566,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( sink_info sink{data_sink.get()}; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1583,7 +1595,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jbooleanArray j_is_map, jstring j_output_path) { + jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds, + jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null 
nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1596,7 +1609,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( using namespace cudf::jni; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1721,8 +1734,12 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. jbooleanArray j_is_int96 = NULL; + // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue. + jbooleanArray j_hasParquetFieldIds = NULL; + jintArray j_parquetFieldIds = NULL; + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1766,8 +1783,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. jbooleanArray j_is_int96 = NULL; + // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue. + jbooleanArray j_hasParquetFieldIds = NULL; + jintArray j_parquetFieldIds = NULL; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 7be1ca2118b..af28cfb6d6c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -36,6 +36,7 @@ import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; import org.junit.jupiter.api.Test; @@ -7899,6 +7900,126 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { } } + @Test + void testParquetWriteWithFieldId() throws IOException { + // field IDs are: + // c1: -1, c2: 2, c3: 3, c31: 31, c32: 32, c4: -4, c5: not specified + ColumnWriterOptions.StructBuilder sBuilder = + structBuilder("c3", true, 3) + .withColumn(true, "c31", 31) + .withColumn(true, "c32", 32); + ParquetWriterOptions options = ParquetWriterOptions.builder() + .withColumn(true, "c1", -1) + .withDecimalColumn("c2", 9, true, 2) + .withStructColumn(sBuilder.build()) + .withTimestampColumn("c4", true, true, -4) + .withColumns( true, "c5") + .build(); + + File tempFile = File.createTempFile("test-field-id", ".parquet"); + try { + HostColumnVector.StructType structType = new 
HostColumnVector.StructType( + true, + new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING)); + + try (Table table0 = new Table.TestBuilder() + .column(true, false) // c1 + .decimal32Column(0, 298, 2473) // c2 + .column(structType, // c3 + new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) + .timestampMicrosecondsColumn(1000L, 2000L) // c4 + .column("a", "b") // c5 + .build()) { + try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { + writer.write(table0); + } + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath( + new Path(tempFile.getAbsolutePath()), + new Configuration()))) { + MessageType schema = reader.getFooter().getFileMetaData().getSchema(); + assert (schema.getFields().get(0).getId().intValue() == -1); + assert (schema.getFields().get(1).getId().intValue() == 2); + assert (schema.getFields().get(2).getId().intValue() == 3); + assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 31); + assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId().intValue() == 32); + assert (schema.getFields().get(3).getId().intValue() == -4); + assert (schema.getFields().get(4).getId() == null); + } + } finally { + tempFile.delete(); + } + } + + @Test + void testParquetWriteWithFieldIdNestNotSpecified() throws IOException { + // field IDs are: + // c0: no field ID + // c1: 1 + // c2: no field ID + // c21: 21 + // c22: no field ID + // c3: 3 + // c31: 31 + // c32: no field ID + // c4: 0 + ColumnWriterOptions.StructBuilder c2Builder = + structBuilder("c2", true) + .withColumn(true, "c21", 21) + .withColumns(true, "c22"); + ColumnWriterOptions.StructBuilder c3Builder = + structBuilder("c3", true, 3) + .withColumn(true, "c31", 31) + .withColumns(true, "c32"); + ParquetWriterOptions options = ParquetWriterOptions.builder() + .withColumns(true, "c0") + .withDecimalColumn("c1", 9, true, 1) + .withStructColumn(c2Builder.build()) + .withStructColumn(c3Builder.build()) + .withColumn(true, "c4", 0) + .build(); + + File tempFile = File.createTempFile("test-field-id", ".parquet"); + try { + HostColumnVector.StructType structType = new HostColumnVector.StructType( + true, + new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING)); + + try (Table table0 = new Table.TestBuilder() + .column(true, false) // c0 + .decimal32Column(0, 298, 2473) // c1 + .column(structType, // c2 + new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) + .column(structType, // c3 + new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) + .column("a", "b") // c4 + .build()) { + try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { + writer.write(table0); + } + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath( + new Path(tempFile.getAbsolutePath()), + new Configuration()))) { + MessageType schema = reader.getFooter().getFileMetaData().getSchema(); + assert (schema.getFields().get(0).getId() == null); + assert (schema.getFields().get(1).getId().intValue() == 1); + assert (schema.getFields().get(2).getId() == null); + assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 21); + assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId() == null); + assert (((GroupType) 
schema.getFields().get(3)).getFields().get(0).getId().intValue() == 31); + assert (((GroupType) schema.getFields().get(3)).getFields().get(1).getId() == null); + assert (schema.getFields().get(4).getId().intValue() == 0); + } + } finally { + tempFile.delete(); + } + } + /** Return a column where DECIMAL64 has been up-casted to DECIMAL128 */ private ColumnVector castDecimal64To128(ColumnView c) { DType dtype = c.getType(); From 94a5d4180b1281d4250e9f915e547789d8da3ce0 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 15 Apr 2022 16:08:09 -0400 Subject: [PATCH 09/26] Add support for null and non-numeric types in Series.diff and DataFrame.diff (#10625) This PR supports non-numeric data types (timestamps and datetime ranges) in `Series.diff` and `DataFrame.diff`. In `DataFrame.diff`, datetime ranges are already supported because `DataFrame.shift` works. But `Series.diff` didn't use the `Series.shift` implementation, so datetime ranges were not supported there. ```python import datetime import numpy as np import pandas as pd from cudf import DataFrame dti = pd.to_datetime( ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1), datetime.datetime(2020, 1, 1)] ) df = DataFrame({"dates": dti}) df.diff(periods=1, axis=0) ``` closes #10212. Authors: - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/10625 --- python/cudf/cudf/core/dataframe.py | 5 -- python/cudf/cudf/core/series.py | 52 +++++++-------------- python/cudf/cudf/tests/test_dataframe.py | 28 +++++++++--- python/cudf/cudf/tests/test_series.py | 58 ++++++++++++++++++++++++ python/cudf/cudf/utils/cudautils.py | 21 --------- 5 files changed, 96 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ae60cd91fac..8893b85c97c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2660,11 +2660,6 @@ def diff(self, periods=1, axis=0): if axis != 0: raise NotImplementedError("Only axis=0 is supported.") - if not all(is_numeric_dtype(i) for i in self.dtypes): - raise NotImplementedError( - "DataFrame.diff only supports numeric dtypes" - ) - if abs(periods) > len(self): df = cudf.DataFrame._from_data( { diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6e15c03e6b4..20ba52afccd 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,11 +14,11 @@ import numpy as np import pandas as pd from pandas._config import get_option +from pandas.core.dtypes.common import is_float import cudf from cudf import _lib as libcudf from cudf._lib.scalar import _is_null_host_scalar -from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -42,7 +42,6 @@ arange, as_column, column, - column_empty_like, full, ) from cudf.core.column.categorical import ( @@ -64,7 +63,7 @@ ) from cudf.core.single_column_frame import SingleColumnFrame from cudf.core.udf.scalar_function import _get_scalar_kernel -from cudf.utils import cudautils, docutils +from cudf.utils import docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, @@ -2969,19 +2968,22 @@ def digitize(self, bins, right=False): @_cudf_nvtx_annotate 
def diff(self, periods=1): - """Calculate the difference between values at positions i and i - N in - an array and store the output in a new array. + """First discrete difference of element. + + Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, + accepts negative values. Returns ------- Series First differences of the Series. - Notes - ----- - Diff currently only supports float and integer dtype columns with - no null values. - Examples -------- >>> import cudf @@ -3028,32 +3030,12 @@ def diff(self, periods=1): 5 dtype: int64 """ - if self.has_nulls: - raise AssertionError( - "Diff currently requires columns with no null values" - ) - - if not np.issubdtype(self.dtype, np.number): - raise NotImplementedError( - "Diff currently only supports numeric dtypes" - ) - - # TODO: move this libcudf - input_col = self._column - output_col = column_empty_like(input_col) - output_mask = column_empty_like(input_col, dtype="bool") - if output_col.size > 0: - cudautils.gpu_diff.forall(output_col.size)( - input_col, output_col, output_mask, periods - ) - - output_col = column.build_column( - data=output_col.data, - dtype=output_col.dtype, - mask=bools_to_mask(output_mask), - ) + if not is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) - return Series(output_col, name=self.name, index=self.index) + return self - self.shift(periods=periods) @copy_docstring(SeriesGroupBy) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 13ab0b35822..07261534777 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9098,7 +9098,7 @@ def test_groupby_cov_for_pandas_bug_case(): ], ) @pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) -def test_diff_dataframe_numeric_dtypes(data, periods): +def test_diff_numeric_dtypes(data, periods): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -9137,7 +9137,7 @@ def test_diff_decimal_dtypes(precision, scale, dtype): ) -def test_diff_dataframe_invalid_axis(): +def test_diff_invalid_axis(): gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): gdf.diff(periods=1, axis=1) @@ -9152,16 +9152,30 @@ def test_diff_dataframe_invalid_axis(): "string_col": ["a", "b", "c", "d", "e"], }, ["a", "b", "c", "d", "e"], - [np.nan, None, np.nan, None], ], ) -def test_diff_dataframe_non_numeric_dypes(data): +def test_diff_unsupported_dtypes(data): gdf = cudf.DataFrame(data) with pytest.raises( - NotImplementedError, - match="DataFrame.diff only supports numeric dtypes", + TypeError, + match=r"unsupported operand type\(s\)", ): - gdf.diff(periods=2, axis=0) + gdf.diff() + + +def test_diff_many_dtypes(): + pdf = pd.DataFrame( + { + "dates": pd.date_range("2020-01-01", "2020-01-06", freq="D"), + "bools": [True, True, True, False, True, True], + "floats": [1.0, 2.0, 3.5, np.nan, 5.0, -1.7], + "ints": [1, 2, 3, 3, 4, 5], + "nans_nulls": [np.nan, None, None, np.nan, np.nan, None], + } + ) + gdf = cudf.from_pandas(pdf) + assert_eq(pdf.diff(), gdf.diff()) + assert_eq(pdf.diff(periods=2), gdf.diff(periods=2)) def test_dataframe_assign_cp_np_array(): diff --git a/python/cudf/cudf/tests/test_series.py 
b/python/cudf/cudf/tests/test_series.py index 6f0f77f0aa2..fccb9f680d9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -18,6 +18,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, + gen_rand, ) @@ -1724,3 +1725,60 @@ def test_isin_categorical(data, values): got = gsr.isin(values) expected = psr.isin(values) assert_eq(got, expected) + + +@pytest.mark.parametrize("dtype", NUMERIC_TYPES) +@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("data_empty", [False, True]) +def test_diff(dtype, period, data_empty): + if data_empty: + data = None + else: + if dtype == np.int8: + # to keep data in range + data = gen_rand(dtype, 100000, low=-2, high=2) + else: + data = gen_rand(dtype, 100000) + + gs = cudf.Series(data, dtype=dtype) + ps = pd.Series(data, dtype=dtype) + + expected_outcome = ps.diff(period) + diffed_outcome = gs.diff(period).astype(expected_outcome.dtype) + + if data_empty: + assert_eq(diffed_outcome, expected_outcome, check_index_type=False) + else: + assert_eq(diffed_outcome, expected_outcome) + + +@pytest.mark.parametrize( + "data", + [ + ["a", "b", "c", "d", "e"], + ], +) +def test_diff_unsupported_dtypes(data): + gs = cudf.Series(data) + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\)", + ): + gs.diff() + + +@pytest.mark.parametrize( + "data", + [ + pd.date_range("2020-01-01", "2020-01-06", freq="D"), + [True, True, True, False, True, True], + [1.0, 2.0, 3.5, 4.0, 5.0, -1.7], + [1, 2, 3, 3, 4, 5], + [np.nan, None, None, np.nan, np.nan, None], + ], +) +def test_diff_many_dtypes(data): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + assert_eq(ps.diff(), gs.diff()) + assert_eq(ps.diff(periods=2), gs.diff(periods=2)) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 4796402f14d..fb6e35f4f58 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -14,27 +14,6 @@ # -@cuda.jit -def gpu_diff(in_col, out_col, out_mask, N): - """Calculate the difference between values at positions i and i - N in an - array and store the output in a new array. - """ - i = cuda.grid(1) - - if N > 0: - if i < in_col.size: - out_col[i] = in_col[i] - in_col[i - N] - out_mask[i] = True - if i < N: - out_mask[i] = False - else: - if i <= (in_col.size + N): - out_col[i] = in_col[i] - in_col[i - N] - out_mask[i] = True - if i >= (in_col.size + N) and i < in_col.size: - out_mask[i] = False - - # Find segments From 9409559433ae55b9c44d68ef52d13b79885a8fde Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 18 Apr 2022 09:25:59 -0500 Subject: [PATCH 10/26] Rework JNI CMake to leverage rapids_find_package (#10649) The JNI CMakeLists.txt has been fragile, looking for specific .a or .so libraries and header file locations, and will break again when libcudf moves to a pre-installed nvcomp 2.3 package since it expects to find nvcomp in a very specific location today. This refactors the JNI CMakeLists.txt to leverage `rapids_find_package` to reuse the work performed in the libcudf build and also has the nice side-effect of avoiding the redundant pulls and builds of the Thrust and RMM repositories that happen today. Another side-effect is that the JNI will now automatically pull in the same RMM compile definitions that are used for libcudf, meaning the separate RMM logging flag for the JNI build is no longer necessary. 
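Condensed from the CMakeLists.txt diff below, the heart of the new approach is just a few lines (a sketch, not the complete file):

```cmake
# Point CMake at the libcudf build tree and let rapids_find_package()
# import its exported cudf::cudf target, rather than hand-locating
# .a/.so files and include directories.
set(cudf_ROOT "${CUDF_CPP_BUILD_DIR}")
rapids_find_package(cudf REQUIRED)

# Headers, Thrust, RMM, and compile definitions now flow in
# transitively from the imported target.
target_link_libraries(cudfjni PUBLIC cudf::cudf)
```
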
Similarly it's no longer necessary to explicitly specify to the JNI build which type of Arrow library to use (i.e. static or dynamic); it will automatically use whichever Arrow library was built by libcudf. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/10649 --- build.sh | 2 - java/README.md | 10 --- java/ci/build-in-docker.sh | 1 - java/pom.xml | 4 - java/src/main/native/CMakeLists.txt | 132 ++++------------------------ 5 files changed, 17 insertions(+), 132 deletions(-) diff --git a/build.sh b/build.sh index e1d6df016dd..48182ca1a6f 100755 --- a/build.sh +++ b/build.sh @@ -148,10 +148,8 @@ function buildLibCudfJniInDocker { -DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \ -DCUDA_STATIC_RUNTIME=ON \ -DPER_THREAD_DEFAULT_STREAM=ON \ - -DRMM_LOGGING_LEVEL=OFF \ -DUSE_GDS=ON \ -DGPU_ARCHS=${CUDF_CMAKE_CUDA_ARCHITECTURES} \ - -DCUDF_JNI_ARROW_STATIC=ON \ -DCUDF_JNI_LIBCUDF_STATIC=ON \ -Dtest=*,!CuFileTest" } diff --git a/java/README.md b/java/README.md index afd69df11ef..ea1b9e3e4e4 100644 --- a/java/README.md +++ b/java/README.md @@ -75,16 +75,6 @@ If you decide to build without Docker and the build script, examining the cmake settings in the [Java CI build script](ci/build-in-docker.sh) can be helpful if you are encountering difficulties during the build. -## Dynamically Linking Arrow - -Since libcudf builds by default with a dynamically linked Arrow dependency, it may be -desirable to build the Java bindings without requiring a statically-linked Arrow to avoid -rebuilding an already built libcudf.so. To do so, specify the additional command-line flag -`-DCUDF_JNI_ARROW_STATIC=OFF` when building the Java bindings with Maven. However this will -result in a jar that requires the correct Arrow version to be available in the runtime -environment, and therefore is not recommended unless you are only performing local testing -within the libcudf build environment. - ## Statically Linking the CUDA Runtime If you use the default cmake options libcudart will be dynamically linked to libcudf and libcudfjni. 
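For context, the CUDA-runtime knob that README section refers to is the CMake option already defined in java/src/main/native/CMakeLists.txt (it appears unchanged in the diff below); this is only an illustrative excerpt:

```cmake
# Defined by the JNI build; configure with -DCUDA_STATIC_RUNTIME=ON
# (as buildLibCudfJniInDocker does in build.sh above) to statically
# link libcudart into libcudfjni instead of the default dynamic link.
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
```
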
diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index d6a193fbeaf..d21010ba30e 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -78,7 +78,6 @@ BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\"\ -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS\ -DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME\ -DCUDF_JNI_LIBCUDF_STATIC=ON\ - -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL\ -DUSE_GDS=$ENABLE_GDS -Dtest=*,!CuFileTest" if [ "$SIGN_FILE" == true ]; then diff --git a/java/pom.xml b/java/pom.xml index e2efed19636..50b6ca59440 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -165,10 +165,8 @@ OFF OFF OFF - INFO OFF ALL - ON OFF ${project.build.directory}/cmake-build 1.7.30 @@ -386,13 +384,11 @@ - - diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9851102d011..3a375412bbd 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -41,7 +41,6 @@ option(BUILD_TESTS "Configure CMake to build tests" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(USE_GDS "Build with GPUDirect Storage (GDS)/cuFile support" OFF) -option(CUDF_JNI_ARROW_STATIC "Statically link Arrow" ON) option(CUDF_JNI_LIBCUDF_STATIC "Link with libcudf.a" OFF) message(VERBOSE "CUDF_JNI: Build with NVTX support: ${USE_NVTX}") @@ -50,7 +49,6 @@ message(VERBOSE "CUDF_JNI: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF_JNI: Build with per-thread default stream: ${PER_THREAD_DEFAULT_STREAM}") message(VERBOSE "CUDF_JNI: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") message(VERBOSE "CUDF_JNI: Build with GPUDirect Storage support: ${USE_GDS}") -message(VERBOSE "CUDF_JNI: Build with static Arrow library: ${CUDF_JNI_ARROW_STATIC}") message(VERBOSE "CUDF_JNI: Link with libcudf statically: ${CUDF_JNI_LIBCUDF_STATIC}") set(CUDF_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../../../../cpp") @@ -93,67 +91,16 @@ endif() rapids_cmake_build_type("Release") # ################################################################################################## -# * Thrust/CUB -# ------------------------------------------------------------------------------------ -include(${CUDF_SOURCE_DIR}/cmake/thirdparty/get_thrust.cmake) +# * nvcomp------------------------------------------------------------------------------------------ -# ################################################################################################## -# * CUDF ------------------------------------------------------------------------------------------ - -set(CUDF_INCLUDE "${PROJECT_SOURCE_DIR}/../../../../cpp/include" - "${PROJECT_SOURCE_DIR}/../../../../cpp/src/" -) - -set(CUDF_LIB_HINTS HINTS "$ENV{CUDF_ROOT}" "$ENV{CUDF_ROOT}/lib" "$ENV{CONDA_PREFIX}/lib" - "${CUDF_CPP_BUILD_DIR}" -) - -find_library(CUDF_LIB "cudf" REQUIRED HINTS ${CUDF_LIB_HINTS}) - -# ################################################################################################## -# * ZLIB ------------------------------------------------------------------------------------------ - -# find zlib -rapids_find_package(ZLIB REQUIRED) +set(nvcomp_DIR "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-build") +rapids_find_package(nvcomp REQUIRED) # ################################################################################################## -# * RMM ------------------------------------------------------------------------------------------- +# * CUDF 
------------------------------------------------------------------------------------------ -include(${CUDF_SOURCE_DIR}/cmake/thirdparty/get_rmm.cmake) - -# ################################################################################################## -# * ARROW ----------------------------------------------------------------------------------------- - -find_path(ARROW_INCLUDE "arrow" HINTS "$ENV{ARROW_ROOT}/include" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-src/cpp/src" -) - -message(STATUS "ARROW: ARROW_INCLUDE set to ${ARROW_INCLUDE}") - -if(CUDF_JNI_ARROW_STATIC) - # Find static version of Arrow lib - set(CUDF_JNI_ARROW_LIBNAME "libarrow.a") -else() - set(CUDF_JNI_ARROW_LIBNAME "arrow") -endif() - -find_library( - ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED - HINTS "$ENV{ARROW_ROOT}/lib" "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/debug" -) - -if(NOT ARROW_LIBRARY) - if(CUDF_JNI_ARROW_STATIC) - message( - FATAL_ERROR "Arrow static library not found. Was libcudf built with CUDF_USE_ARROW_STATIC=ON?" - ) - else() - message(FATAL_ERROR "Arrow dynamic library not found.") - endif() -else() - message(STATUS "ARROW: ARROW_LIBRARY set to ${ARROW_LIBRARY}") -endif() +set(cudf_ROOT "${CUDF_CPP_BUILD_DIR}") +rapids_find_package(cudf REQUIRED) # ################################################################################################## # * find JNI ------------------------------------------------------------------------------------- @@ -164,27 +111,6 @@ else() message(FATAL_ERROR "JDK with JNI not found, please check your settings.") endif() -# ################################################################################################## -# * nvcomp ---------------------------------------------------------------------------------------- - -find_path(NVCOMP_INCLUDE "nvcomp" HINTS "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-src/include" - "$ENV{CONDA_PREFIX}/include" -) - -message(STATUS "NVCOMP: NVCOMP_INCLUDE set to ${NVCOMP_INCLUDE}") - -set(CUDF_JNI_NVCOMP_LIBNAME "libnvcomp.a") -find_library( - NVCOMP_LIBRARY ${CUDF_JNI_NVCOMP_LIBNAME} REQUIRED HINTS "${CUDF_CPP_BUILD_DIR}/lib" - "$ENV{CONDA_PREFIX}/lib" -) - -if(NOT NVCOMP_LIBRARY) - message(FATAL_ERROR "nvcomp static library not found.") -else() - message(STATUS "NVCOMP: NVCOMP_LIBRARY set to ${NVCOMP_LIBRARY}") -endif() - # ################################################################################################## # * GDS/cufile ------------------------------------------------------------------------------------ @@ -238,17 +164,8 @@ endif() # * include paths --------------------------------------------------------------------------------- target_include_directories( - cudfjni - PUBLIC cudf::Thrust - "${LIBCUDACXX_INCLUDE}" - "${CUDAToolkit_INCLUDE_DIRS}" - "${NVCOMP_INCLUDE}" - "${CMAKE_BINARY_DIR}/include" - "${CMAKE_SOURCE_DIR}/include" - "${CMAKE_SOURCE_DIR}/src" - "${JNI_INCLUDE_DIRS}" - "${CUDF_INCLUDE}" - "${ARROW_INCLUDE}" + cudfjni PUBLIC "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" + "${CMAKE_SOURCE_DIR}/src" "${JNI_INCLUDE_DIRS}" ) # ################################################################################################## @@ -291,39 +208,24 @@ if(USE_GDS) POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) - target_include_directories( - cufilejni - PUBLIC "${LIBCUDACXX_INCLUDE}" "${CUDF_INCLUDE}" - PRIVATE "${cuFile_INCLUDE_DIRS}" - ) - target_link_libraries(cufilejni PRIVATE cudfjni rmm::rmm "${cuFile_LIBRARIES}") + 
target_include_directories(cufilejni PRIVATE "${cuFile_INCLUDE_DIRS}") + target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") endif() -# ################################################################################################## -# * rmm logging level ----------------------------------------------------------------------------- - -set(RMM_LOGGING_LEVEL - "INFO" - CACHE STRING "Choose the logging level." -) -# Set the possible values of build type for cmake-gui -set_property( - CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF" -) -message(STATUS "RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") - -target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}) - # ################################################################################################## # * link libraries -------------------------------------------------------------------------------- -set(CUDF_LINK ${CUDF_LIB}) +set(CUDF_LINK PUBLIC cudf::cudf) if(CUDF_JNI_LIBCUDF_STATIC) - set(CUDF_LINK -Wl,--whole-archive ${CUDF_LIB} -Wl,--no-whole-archive ZLIB::ZLIB) + # Whole-link libcudf.a into the shared library but not its dependencies + set(CUDF_LINK PRIVATE -Wl,--whole-archive cudf::cudf -Wl,--no-whole-archive PUBLIC cudf::cudf) endif() +# When nvcomp is installed we need to use nvcomp::nvcomp but from the cudf build directory it will +# just be nvcomp. target_link_libraries( - cudfjni PRIVATE ${CUDF_LINK} ${NVCOMP_LIBRARY} ${ARROW_LIBRARY} rmm::rmm CUDA::cuda_driver + cudfjni ${CUDF_LINK} PRIVATE $<TARGET_NAME_IF_EXISTS:nvcomp> + $<TARGET_NAME_IF_EXISTS:nvcomp::nvcomp> ) # ################################################################################################## From 45c003dc70790f02b044fb06e5f95679df6600de Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 18 Apr 2022 13:39:37 -0500 Subject: [PATCH 11/26] Fix list of testing requirements in setup.py. (#10678) The list of testing requirements was missing a comma in `setup.py`. This fixes it. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10678 --- python/cudf/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 2ec9909dd6f..a447fcfe027 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -47,7 +47,8 @@ "pytest", "pytest-benchmark", "pytest-xdist", - "hypothesis" "mimesis", + "hypothesis", + "mimesis", "fastavro>=0.22.9", "python-snappy>=0.6.0", "pyorc", From c322cbac3ef31836f3327b1c048905ebdccdeec0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 18 Apr 2022 13:42:39 -0500 Subject: [PATCH 12/26] Standardize imports. (#10680) This PR standardizes a few imports across the cudf code base. Changes include: - Removed usage of some non-standard "two letter" names. For example, `import numpy as np` is common, but `import pyorc as po` and `import fastavro as fa` are non-standard and not the style used by their documentation. I left `import cupy as cp`, since both `import cupy` and `import cupy as cp` are prevalent in the code base (the one exception that I changed was a file that had both `import cupy` and `import cupy as cp`). 
- Avoid the pattern `from some_package import x as x` -- just write `from some_package import x` - Fixed some `cimport`s - Always use `import datetime` instead of `import datetime as dt` to avoid conflicts with the many other `dt` names in our code (including local names that had the potential to shadow/overwrite the library's name) - Use `warnings.warn` rather than `from warnings import warn` for consistency across the library - Remove some legacy Python 2 compatibility Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10680 --- python/cudf/cudf/_lib/column.pyx | 4 +- python/cudf/cudf/_lib/null_mask.pyx | 5 +- python/cudf/cudf/_lib/parquet.pyx | 4 +- python/cudf/cudf/_lib/rolling.pyx | 4 +- python/cudf/cudf/core/algorithms.py | 4 +- python/cudf/cudf/core/column/datetime.py | 6 +- python/cudf/cudf/core/column/decimal.py | 4 +- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/dataframe.py | 10 +-- python/cudf/cudf/core/subword_tokenizer.py | 6 +- python/cudf/cudf/tests/test_api_types.py | 18 ++--- python/cudf/cudf/tests/test_contains.py | 8 ++- python/cudf/cudf/tests/test_dataframe.py | 5 +- python/cudf/cudf/tests/test_datetime.py | 5 +- python/cudf/cudf/tests/test_duplicates.py | 4 +- python/cudf/cudf/tests/test_hdfs.py | 6 +- python/cudf/cudf/tests/test_orc.py | 83 ++++++++++++---------- python/cudf/cudf/tests/test_scalar.py | 8 +-- python/cudf/cudf/utils/dtypes.py | 6 +- python/cudf/cudf/utils/queryutils.py | 4 +- 20 files changed, 101 insertions(+), 97 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 448a22425a4..8cbadfa19a5 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -7,7 +7,7 @@ import pandas as pd import rmm import cudf -import cudf._lib as libcudfxx +import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype from cudf.core.buffer import Buffer @@ -160,7 +160,7 @@ cdef class Column: if self.base_mask is None or self.offset == 0: self._mask = self.base_mask else: - self._mask = libcudfxx.null_mask.copy_bitmask(self) + self._mask = libcudf.null_mask.copy_bitmask(self) return self._mask @property diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index b6e26fe594f..ce83a6f0f18 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from enum import Enum @@ -8,9 +8,6 @@ from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf._lib.column cimport Column - -import cudf._lib as libcudfxx - from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 8cb7dd942c1..e363ea875f0 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
# cython: boundscheck = False @@ -17,7 +17,7 @@ except ImportError: import json import numpy as np -from cython.operator import dereference +from cython.operator cimport dereference from cudf.api.types import ( is_categorical_dtype, diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx index b4b3384032c..a2cb115f668 100644 --- a/python/cudf/cudf/_lib/rolling.pyx +++ b/python/cudf/cudf/_lib/rolling.pyx @@ -1,6 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -from __future__ import print_function +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pandas as pd diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 22a5666ef3f..d13c55dfcc0 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,5 +1,5 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from warnings import warn +import warnings import cupy as cp import numpy as np @@ -50,7 +50,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): raise NotImplementedError("na_sentinel can not be None.") if size_hint: - warn("size_hint is not applicable for cudf.factorize") + warnings.warn("size_hint is not applicable for cudf.factorize") return_cupy_array = isinstance(values, cp.ndarray) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index fac8af652c1..375a19f5423 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -2,7 +2,7 @@ from __future__ import annotations -import datetime as dt +import datetime import locale import re from locale import nl_langinfo @@ -237,9 +237,9 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): return other - if isinstance(other, dt.datetime): + if isinstance(other, datetime.datetime): other = np.datetime64(other) - elif isinstance(other, dt.timedelta): + elif isinstance(other, datetime.timedelta): other = np.timedelta64(other) elif isinstance(other, pd.Timestamp): other = other.to_datetime64() diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index f10e257d359..d8ddb3d8d1a 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. +import warnings from decimal import Decimal from typing import Any, Sequence, Tuple, Union, cast -from warnings import warn import cupy as cp import numpy as np @@ -43,7 +43,7 @@ def as_decimal_column( isinstance(dtype, cudf.core.dtypes.DecimalDtype) and dtype.scale < self.dtype.scale ): - warn( + warnings.warn( "cuDF truncates when downcasting decimals to a lower scale. " "To round, use Series.round() or DataFrame.round()." 
) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 15815427aca..810624e9f4e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -2,7 +2,7 @@ from __future__ import annotations -import datetime as dt +import datetime from typing import Any, Sequence, cast import numpy as np @@ -211,7 +211,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def normalize_binop_value(self, other) -> ColumnBinaryOperand: if isinstance(other, (ColumnBase, cudf.Scalar)): return other - if isinstance(other, dt.timedelta): + if isinstance(other, datetime.timedelta): other = np.timedelta64(other) elif isinstance(other, pd.Timestamp): other = other.to_datetime64() diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8893b85c97c..24aa0d01b3c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5596,14 +5596,14 @@ def select_dtypes(self, include=None, exclude=None): @ioutils.doc_to_parquet() def to_parquet(self, path, *args, **kwargs): """{docstring}""" - from cudf.io import parquet as pq + from cudf.io import parquet - return pq.to_parquet(self, path, *args, **kwargs) + return parquet.to_parquet(self, path, *args, **kwargs) @ioutils.doc_to_feather() def to_feather(self, path, *args, **kwargs): """{docstring}""" - from cudf.io import feather as feather + from cudf.io import feather feather.to_feather(self, path, *args, **kwargs) @@ -5623,7 +5623,7 @@ def to_csv( **kwargs, ): """{docstring}""" - from cudf.io import csv as csv + from cudf.io import csv return csv.to_csv( self, @@ -5643,7 +5643,7 @@ def to_csv( @ioutils.doc_to_orc() def to_orc(self, fname, compression=None, *args, **kwargs): """{docstring}""" - from cudf.io import orc as orc + from cudf.io import orc orc.to_orc(self, fname, compression, *args, **kwargs) diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 782b74ef4a6..83cceff5c4c 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -1,9 +1,9 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from __future__ import annotations +import warnings from typing import Union -from warnings import warn import cupy as cp @@ -186,7 +186,7 @@ def __call__( "When truncation is not True, the behaviour currently differs " "from HuggingFace as cudf always returns overflowing tokens" ) - warn(warning_msg) + warnings.warn(warning_msg) if padding != "max_length": error_msg = ( diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index e7cf113f604..c2cd78f88a0 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -3,10 +3,10 @@ import numpy as np import pandas as pd import pytest -from pandas.api import types as ptypes +from pandas.api import types as pd_types import cudf -from cudf.api import types as types +from cudf.api import types @pytest.mark.parametrize( @@ -1035,11 +1035,13 @@ def test_is_decimal_dtype(obj, expect): ), ) def test_pandas_agreement(obj): - assert types.is_categorical_dtype(obj) == ptypes.is_categorical_dtype(obj) - assert types.is_numeric_dtype(obj) == ptypes.is_numeric_dtype(obj) - assert types.is_integer_dtype(obj) == ptypes.is_integer_dtype(obj) - assert types.is_integer(obj) == ptypes.is_integer(obj) - assert types.is_string_dtype(obj) == ptypes.is_string_dtype(obj) + assert types.is_categorical_dtype(obj) == pd_types.is_categorical_dtype( + obj + ) + assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj) + assert types.is_integer_dtype(obj) == pd_types.is_integer_dtype(obj) + assert types.is_integer(obj) == pd_types.is_integer(obj) + assert types.is_string_dtype(obj) == pd_types.is_string_dtype(obj) @pytest.mark.parametrize( @@ -1115,7 +1117,7 @@ def test_pandas_agreement(obj): ), ) def test_pandas_agreement_scalar(obj): - assert types.is_scalar(obj) == ptypes.is_scalar(obj) + assert types.is_scalar(obj) == pd_types.is_scalar(obj) # TODO: Add test of interval. diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index f06142f4cc9..15dfa111860 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,6 @@ -from datetime import datetime as dt +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ +import datetime import numpy as np import pandas as pd @@ -41,12 +43,12 @@ def get_string_series(): testdata_all = [ ( cudf_date_series("20010101", "20020215", freq="400h"), - dt.strptime("2001-01-01", "%Y-%m-%d"), + datetime.datetime.strptime("2001-01-01", "%Y-%m-%d"), True, ), ( cudf_date_series("20010101", "20020215", freq="400h"), - dt.strptime("2000-01-01", "%Y-%m-%d"), + datetime.datetime.strptime("2000-01-01", "%Y-%m-%d"), False, ), (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 07261534777..2685524add4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -13,7 +13,6 @@ from copy import copy import cupy -import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -7332,7 +7331,7 @@ def test_sample_axis_0( @pytest.mark.parametrize("replace", [True, False]) @pytest.mark.parametrize( - "random_state_lib", [cp.random.RandomState, np.random.RandomState] + "random_state_lib", [cupy.random.RandomState, np.random.RandomState] ) def test_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) @@ -7384,7 +7383,7 @@ def test_oversample_without_replace(n, frac, axis): ) -@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) +@pytest.mark.parametrize("random_state", [None, cupy.random.RandomState(42)]) def test_sample_unsupported_arguments(random_state): df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) with pytest.raises( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 964ac9e5457..8be338e787a 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1,7 +1,6 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. import datetime -import datetime as dt import operator import re @@ -219,8 +218,8 @@ def test_sort_datetime(): def test_issue_165(): df_pandas = pd.DataFrame() - start_date = dt.datetime.strptime("2000-10-21", "%Y-%m-%d") - data = [(start_date + dt.timedelta(days=x)) for x in range(6)] + start_date = datetime.datetime.strptime("2000-10-21", "%Y-%m-%d") + data = [(start_date + datetime.timedelta(days=x)) for x in range(6)] df_pandas["dates"] = data df_pandas["num"] = [1, 2, 3, 4, 5, 6] df_cudf = DataFrame.from_pandas(df_pandas) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index e8a695570f0..a80208cfd7d 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-import itertools as it +import itertools import random import numpy as np @@ -280,7 +280,7 @@ def test_drop_duplicates_empty(df): @pytest.mark.parametrize("num_columns", [3, 4, 5]) def test_dataframe_drop_duplicates_numeric_method(num_columns): - comb = list(it.permutations(range(num_columns), num_columns)) + comb = list(itertools.permutations(range(num_columns), num_columns)) shuf = list(comb) random.Random(num_columns).shuffle(shuf) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index de4303a34a8..8730cb187b5 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -3,12 +3,12 @@ import os from io import BytesIO -import fastavro as fa +import fastavro import numpy as np import pandas as pd import pyarrow as pa import pytest -from pyarrow import orc as orc +from pyarrow import orc import cudf from cudf.testing._utils import assert_eq @@ -253,7 +253,7 @@ def test_read_avro(datadir, hdfs, test_url): got = cudf.read_avro(hd_fpath) with open(fname, mode="rb") as f: - expect = pd.DataFrame.from_records(fa.reader(f)) + expect = pd.DataFrame.from_records(fastavro.reader(f)) for col in expect.columns: expect[col] = expect[col].astype(got[col].dtype) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 5082fb08b92..c3969bf6c14 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -11,7 +11,7 @@ import pandas as pd import pyarrow as pa import pyarrow.orc -import pyorc as po +import pyorc import pytest import cudf @@ -307,7 +307,7 @@ def test_orc_read_skiprows(tmpdir): {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]}, dtype=pd.BooleanDtype(), ) - writer = po.Writer(buff, po.Struct(a=po.Boolean())) + writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) tuples = list( map( lambda x: (None,) if x[0] is pd.NA else x, @@ -931,29 +931,35 @@ def generate_list_struct_buff(size=100_000): buff = BytesIO() schema = { - "lvl3_list": po.Array(po.Array(po.Array(po.BigInt()))), - "lvl1_list": po.Array(po.BigInt()), - "lvl1_struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}), - "lvl2_struct": po.Struct( + "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), + "lvl1_list": pyorc.Array(pyorc.BigInt()), + "lvl1_struct": pyorc.Struct( + **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} + ), + "lvl2_struct": pyorc.Struct( **{ - "a": po.BigInt(), - "lvl1_struct": po.Struct( - **{"c": po.BigInt(), "d": po.BigInt()} + "a": pyorc.BigInt(), + "lvl1_struct": pyorc.Struct( + **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} ), } ), - "list_nests_struct": po.Array( - po.Array(po.Struct(**{"a": po.BigInt(), "b": po.BigInt()})) + "list_nests_struct": pyorc.Array( + pyorc.Array( + pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) + ) ), - "struct_nests_list": po.Struct( + "struct_nests_list": pyorc.Struct( **{ - "struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}), - "list": po.Array(po.BigInt()), + "struct": pyorc.Struct( + **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} + ), + "list": pyorc.Array(pyorc.BigInt()), } ), } - schema = po.Struct(**schema) + schema = pyorc.Struct(**schema) lvl3_list = [ rd.choice( @@ -1019,7 +1025,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = po.Writer(buff, schema, stripe_size=1024) + writer = pyorc.Writer(buff, schema, stripe_size=1024) tuples = list( map( lambda x: (None,) if x[0] is pd.NA else x, @@ -1101,15 +1107,17 @@ def gen_map_buff(size=10000): buff = BytesIO() schema = { - 
"lvl1_map": po.Map(key=po.String(), value=po.BigInt()), - "lvl2_map": po.Map(key=po.String(), value=po.Array(po.BigInt())), - "lvl2_struct_map": po.Map( - key=po.String(), - value=po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}), + "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), + "lvl2_map": pyorc.Map( + key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) + ), + "lvl2_struct_map": pyorc.Map( + key=pyorc.String(), + value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), ), } - schema = po.Struct(**schema) + schema = pyorc.Struct(**schema) lvl1_map = [ rd.choice( @@ -1186,8 +1194,8 @@ def gen_map_buff(size=10000): "lvl2_struct_map": lvl2_struct_map, } ) - writer = po.Writer( - buff, schema, stripe_size=1024, compression=po.CompressionKind.NONE + writer = pyorc.Writer( + buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE ) tuples = list( map( @@ -1479,8 +1487,9 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with po.Writer( - buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt()) + with pyorc.Writer( + buff, + pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()), ) as writer: writer.write((maxint64, minint64, minint64)) writer.write((1, -1, 1)) @@ -1497,20 +1506,20 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): buff = BytesIO() - orc_schema = po.Struct( - a=po.BigInt(), - b=po.Double(), - c=po.String(), - d=po.Decimal(11, 2), - e=po.Date(), - f=po.Timestamp(), - g=po.Boolean(), - h=po.Binary(), - i=po.BigInt(), + orc_schema = pyorc.Struct( + a=pyorc.BigInt(), + b=pyorc.Double(), + c=pyorc.String(), + d=pyorc.Decimal(11, 2), + e=pyorc.Date(), + f=pyorc.Timestamp(), + g=pyorc.Boolean(), + h=pyorc.Binary(), + i=pyorc.BigInt(), # One column with non null value, else cudf/pyorc readers crash ) data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with po.Writer(buff, orc_schema) as writer: + with pyorc.Writer(buff, orc_schema) as writer: writer.write(data) got = cudf.io.orc.read_orc_statistics([buff]) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index e8382681820..79211456996 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,7 +1,6 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. import datetime -import datetime as dt import re from decimal import Decimal @@ -11,7 +10,6 @@ import pytest import cudf -from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element from cudf.testing._utils import ( ALL_TYPES, @@ -297,9 +295,9 @@ def test_date_duration_scalars(value): actual = s.value - if isinstance(value, dt.datetime): + if isinstance(value, datetime.datetime): expected = np.datetime64(value) - elif isinstance(value, dt.timedelta): + elif isinstance(value, datetime.timedelta): expected = np.timedelta64(value) elif isinstance(value, pd.Timestamp): expected = value.to_datetime64() @@ -344,7 +342,7 @@ def test_scalar_invalid_implicit_conversion(cls, dtype): cls(pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): - slr = pycudf_scalar(None, dtype=dtype) + slr = cudf.Scalar(None, dtype=dtype) cls(slr) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4cd1738996f..35c6fdc73f8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-import datetime as dt
+import datetime
 from collections import namedtuple
 from decimal import Decimal

@@ -259,9 +259,9 @@ def to_cudf_compatible_scalar(val, dtype=None):
         ) or cudf.api.types.is_string_dtype(dtype):
             dtype = "str"

-    if isinstance(val, dt.datetime):
+    if isinstance(val, datetime.datetime):
         val = np.datetime64(val)
-    elif isinstance(val, dt.timedelta):
+    elif isinstance(val, datetime.timedelta):
         val = np.timedelta64(val)
     elif isinstance(val, pd.Timestamp):
         val = val.to_datetime64()
diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py
index cdaaff6b2af..25b3d517e1c 100644
--- a/python/cudf/cudf/utils/queryutils.py
+++ b/python/cudf/cudf/utils/queryutils.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2018-2022, NVIDIA CORPORATION.

 import ast
-import datetime as dt
+import datetime
 from typing import Any, Dict

 import numpy as np
@@ -232,7 +232,7 @@ def query_execute(df, expr, callenv):
             name = name[len(ENVREF_PREFIX) :]
         try:
             val = envdict[name]
-            if isinstance(val, dt.datetime):
+            if isinstance(val, datetime.datetime):
                 val = np.datetime64(val)
         except KeyError:
             msg = "{!r} not defined in the calling environment"

From 6c79b5902d55bab599731a9bded7e89b9c4875c5 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 18 Apr 2022 18:20:38 -0500
Subject: [PATCH 13/26] Standardize usage of collections.abc. (#10679)

The codebase currently uses multiple ways of importing `collections.abc` classes in cudf. This can be problematic because names in `collections.abc` can overlap with names in `typing`, so we need a way to disambiguate the two. This PR standardizes our imports so that `abc.` always appears in the name of an abstract base class:

```python
from collections import abc  # Not "import collections.abc" or "from collections.abc import Mapping"

if isinstance(obj, abc.Mapping):
    pass
```

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/10679
---
 python/cudf/cudf/_fuzz_testing/json.py      |  2 +-
 python/cudf/cudf/_lib/csv.pyx               |  8 +++----
 python/cudf/cudf/_lib/json.pyx              | 10 ++++-----
 python/cudf/cudf/api/types.py               |  4 ++--
 python/cudf/cudf/comm/gpuarrow.py           |  5 ++---
 python/cudf/cudf/core/column/categorical.py |  6 +++--
 python/cudf/cudf/core/column_accessor.py    |  8 +++----
 python/cudf/cudf/core/cut.py                |  6 ++---
 python/cudf/cudf/core/dataframe.py          | 25 +++++++++++----------
 python/cudf/cudf/core/df_protocol.py        |  6 ++---
 python/cudf/cudf/core/groupby/groupby.py    |  6 ++---
 python/cudf/cudf/core/join/_join_helpers.py |  4 ++--
 python/cudf/cudf/core/multiindex.py         |  6 ++---
 python/cudf/cudf/core/reshape.py            |  9 ++++----
 python/cudf/cudf/core/series.py             |  2 +-
 python/cudf/cudf/testing/_utils.py          | 10 ++++-----
 16 files changed, 60 insertions(+), 57 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
index f850a7e79f9..29e0aeb7050 100644
--- a/python/cudf/cudf/_fuzz_testing/json.py
+++ b/python/cudf/cudf/_fuzz_testing/json.py
@@ -2,7 +2,7 @@

 import logging
 import random
-from collections import abc as abc
+from collections import abc

 import numpy as np
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 05ff32392fe..f1a75baa951 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from libcpp cimport bool from libcpp.map cimport map @@ -19,9 +19,9 @@ import cudf from cudf._lib.cpp.types cimport size_type -import collections.abc as abc import errno import os +from collections import abc from enum import IntEnum from io import BytesIO, StringIO @@ -238,7 +238,7 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: dictionaries are unsupported") if not isinstance(parse_dates, abc.Iterable): raise NotImplementedError( - "`parse_dates`: non-lists are unsupported") + "`parse_dates`: an iterable is required") for col in parse_dates: if isinstance(col, str): c_parse_dates_names.push_back(str(col).encode()) @@ -279,7 +279,7 @@ cdef csv_reader_options make_csv_reader_options( ) csv_reader_options_c.set_dtypes(c_dtypes_list) csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - elif isinstance(dtype, abc.Iterable): + elif isinstance(dtype, abc.Collection): c_dtypes_list.reserve(len(dtype)) for index, col_dtype in enumerate(dtype): if col_dtype in CSV_HEX_TYPE_MAP: diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 48da83450d7..263d70afe26 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,11 +1,11 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # cython: boundscheck = False -import collections.abc as abc import io import os +from collections import abc import cudf @@ -82,15 +82,15 @@ cpdef read_json(object filepaths_or_buffers, for k, v in dtype.items(): c_dtypes_map[str(k).encode()] = \ _get_cudf_data_type_from_dtype(v) - elif not isinstance(dtype, abc.Iterable): - raise TypeError("`dtype` must be 'list like' or 'dict'") - else: + elif isinstance(dtype, abc.Collection): is_list_like_dtypes = True c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: c_dtypes_list.push_back( _get_cudf_data_type_from_dtype( col_dtype)) + else: + raise TypeError("`dtype` must be 'list like' or 'dict'") cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index fad2a973681..56b453dae95 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from collections.abc import Sequence +from collections import abc from functools import wraps from inspect import isclass from typing import List, Union @@ -174,7 +174,7 @@ def is_list_like(obj): bool Return True if given object is list-like. """ - return isinstance(obj, (Sequence, np.ndarray)) and not isinstance( + return isinstance(obj, (abc.Sequence, np.ndarray)) and not isinstance( obj, (str, bytes) ) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index f21eb4e4d8c..09b4cc5ffba 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -1,6 +1,5 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. 
-from collections import OrderedDict -from collections.abc import Sequence +from collections import OrderedDict, abc import numpy as np import pandas as pd @@ -32,7 +31,7 @@ def __init__(self, source, schema=None): self._open(source, schema) -class GpuArrowReader(Sequence): +class GpuArrowReader(abc.Sequence): def __init__(self, schema, dev_ary): self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 911391ef984..7c33b9f81fe 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,7 +3,7 @@ from __future__ import annotations import pickle -from collections.abc import MutableSequence +from collections import abc from functools import cached_property from typing import ( TYPE_CHECKING, @@ -1379,7 +1379,9 @@ def view(self, dtype: Dtype) -> ColumnBase: ) @staticmethod - def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: + def _concat( + objs: abc.MutableSequence[CategoricalColumn], + ) -> CategoricalColumn: # TODO: This function currently assumes it is being called from # column.concat_columns, at least to the extent that all the # preprocessing in that function has already been done. That should be diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 291e50386cc..24a2958ce97 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,7 @@ from __future__ import annotations import itertools -from collections.abc import MutableMapping +from collections import abc from functools import cached_property, reduce from typing import ( TYPE_CHECKING, @@ -78,7 +78,7 @@ def _to_flat_dict(d): return {k: v for k, v in _to_flat_dict_inner(d)} -class ColumnAccessor(MutableMapping): +class ColumnAccessor(abc.MutableMapping): """ Parameters ---------- @@ -99,7 +99,7 @@ class ColumnAccessor(MutableMapping): def __init__( self, - data: Union[MutableMapping, ColumnAccessor] = None, + data: Union[abc.MutableMapping, ColumnAccessor] = None, multiindex: bool = False, level_names=None, ): @@ -213,7 +213,7 @@ def columns(self) -> Tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property - def _grouped_data(self) -> MutableMapping: + def _grouped_data(self) -> abc.MutableMapping: """ If self.multiindex is True, return the underlying mapping as a nested mapping. diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 0fef6630248..2ec39043eb2 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,6 +1,6 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. 
-from collections.abc import Sequence +from collections import abc import cupy import numpy as np @@ -140,7 +140,7 @@ def cut( ) # bins can either be an int, sequence of scalars or an intervalIndex - if isinstance(bins, Sequence): + if isinstance(bins, abc.Sequence): if len(set(bins)) is not len(bins): if duplicates == "raise": raise ValueError( @@ -158,7 +158,7 @@ def cut( # create bins if given an int or single scalar if not isinstance(bins, pd.IntervalIndex): - if not isinstance(bins, (Sequence)): + if not isinstance(bins, (abc.Sequence)): if isinstance( x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray) ): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 24aa0d01b3c..4ffacfa2ccc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -9,8 +9,7 @@ import pickle import sys import warnings -from collections import defaultdict -from collections.abc import Iterable, Mapping, Sequence +from collections import abc, defaultdict from typing import ( Any, Callable, @@ -1857,7 +1856,7 @@ def _make_operands_and_index_for_binop( Optional[BaseIndex], ]: # Check built-in types first for speed. - if isinstance(other, (list, dict, Sequence, Mapping)): + if isinstance(other, (list, dict, abc.Sequence, abc.Mapping)): warnings.warn( "Binary operations between host objects such as " f"{type(other)} and cudf.DataFrame are deprecated and will be " @@ -1878,7 +1877,7 @@ def _make_operands_and_index_for_binop( if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} - elif isinstance(other, (list, Sequence)): + elif isinstance(other, (list, abc.Sequence)): rhs = {name: o for (name, o) in zip(self._data, other)} elif isinstance(other, Series): rhs = dict(zip(other.index.values_host, other.values_host)) @@ -1907,7 +1906,7 @@ def _make_operands_and_index_for_binop( # the fill value. 
left_default = fill_value - if not isinstance(rhs, (dict, Mapping)): + if not isinstance(rhs, (dict, abc.Mapping)): return NotImplemented, None operands = { @@ -2961,7 +2960,9 @@ def agg(self, aggs, axis=None): if axis == 0 or axis is not None: raise NotImplementedError("axis not implemented yet") - if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)): + if isinstance(aggs, abc.Iterable) and not isinstance( + aggs, (str, dict) + ): result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization @@ -2997,13 +2998,13 @@ def agg(self, aggs, axis=None): f"'Series' object" ) result[key] = getattr(col, value)() - elif all([isinstance(val, Iterable) for val in aggs.values()]): + elif all([isinstance(val, abc.Iterable) for val in aggs.values()]): idxs = set() for val in aggs.values(): - if isinstance(val, Iterable): - idxs.update(val) - elif isinstance(val, str): + if isinstance(val, str): idxs.add(val) + elif isinstance(val, abc.Iterable): + idxs.update(val) idxs = sorted(list(idxs)) for agg in idxs: if agg is callable: @@ -3017,7 +3018,7 @@ def agg(self, aggs, axis=None): len(idxs), dtype=col.dtype, masked=True ) ans = cudf.Series(data=col_empty, index=idxs) - if isinstance(aggs.get(key), Iterable): + if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization for agg in aggs.get(key): @@ -6157,7 +6158,7 @@ def _sample_axis_1( def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: Iterable[str], + column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, ) -> DataFrame: result = super()._from_columns_like_self( diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 4a30a78bf65..f4ce658bff3 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,7 +1,7 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. -import collections import enum +from collections import abc from typing import ( Any, Dict, @@ -569,13 +569,13 @@ def get_columns(self) -> Iterable[_CuDFColumn]: ] def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": - if not isinstance(indices, collections.abc.Sequence): + if not isinstance(indices, abc.Sequence): raise ValueError("`indices` is not a sequence") return _CuDFDataFrame(self._df.iloc[:, indices]) def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": - if not isinstance(names, collections.Sequence): + if not isinstance(names, abc.Sequence): raise ValueError("`names` is not a sequence") return _CuDFDataFrame( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 40f8eda0e4f..76b0217df3b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,9 +1,9 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-import collections import itertools import pickle import warnings +from collections import abc from functools import cached_property from typing import Any, Iterable, List, Tuple, Union @@ -1638,7 +1638,7 @@ def _handle_by_or_level(self, by=None, level=None): self._handle_series(by) elif isinstance(by, cudf.BaseIndex): self._handle_index(by) - elif isinstance(by, collections.abc.Mapping): + elif isinstance(by, abc.Mapping): self._handle_mapping(by) elif isinstance(by, Grouper): self._handle_grouper(by) @@ -1757,7 +1757,7 @@ def _is_multi_agg(aggs): Returns True if more than one aggregation is performed on any of the columns as specified in `aggs`. """ - if isinstance(aggs, collections.abc.Mapping): + if isinstance(aggs, abc.Mapping): return any(is_list_like(agg) for agg in aggs.values()) if is_list_like(aggs): return True diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index ead0cd566d9..e1057c3b997 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -2,8 +2,8 @@ from __future__ import annotations -import collections import warnings +from collections import abc from typing import TYPE_CHECKING, Any, Tuple, cast import numpy as np @@ -166,7 +166,7 @@ def _match_categorical_dtypes_both( def _coerce_to_tuple(obj): - if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + if isinstance(obj, abc.Iterable) and not isinstance(obj, str): return tuple(obj) else: return (obj,) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 591ec582a3b..9b0b922a7d3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,7 @@ import itertools import numbers import pickle -from collections.abc import Sequence +from collections import abc from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Tuple, Union @@ -95,7 +95,7 @@ def __init__( if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (Sequence, np.ndarray) + codes[0], (abc.Sequence, np.ndarray) ): raise TypeError("Codes is not a Sequence of sequences") @@ -912,7 +912,7 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, Sequence)): + if isinstance(index, (Integral, abc.Sequence)): index = np.array(index) elif isinstance(index, slice): start, stop, step = index.indices(len(self)) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a388e2560ee..f58c93aa0dc 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. 
 import itertools
+from collections import abc
 from typing import Dict, Optional

 import numpy as np
@@ -485,14 +486,14 @@ def melt(
     1  b        B      3
     2  c        B      5
     """
-    assert col_level in (None,)
+    if col_level is not None:
+        raise NotImplementedError("col_level != None is not supported yet.")

     # Arg cleaning
-    import collections

     # id_vars
     if id_vars is not None:
-        if not isinstance(id_vars, collections.abc.Sequence):
+        if not isinstance(id_vars, abc.Sequence):
             id_vars = [id_vars]
         id_vars = list(id_vars)
         missing = set(id_vars) - set(frame._column_names)
@@ -506,7 +507,7 @@

     # value_vars
     if value_vars is not None:
-        if not isinstance(value_vars, collections.abc.Sequence):
+        if not isinstance(value_vars, abc.Sequence):
             value_vars = [value_vars]
         value_vars = list(value_vars)
         missing = set(value_vars) - set(frame._column_names)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 20ba52afccd..f780b5e3895 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -6,7 +6,7 @@
 import inspect
 import pickle
 import warnings
-from collections import abc as abc
+from collections import abc
 from shutil import get_terminal_size
 from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 4dd9f434097..fbae7850e60 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -3,7 +3,7 @@
 import itertools
 import re
 import warnings
-from collections.abc import Mapping, Sequence
+from collections import abc
 from contextlib import contextmanager
 from decimal import Decimal
@@ -238,9 +238,9 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs):
     else:
         if len(func_args_and_kwargs) == 1:
             func_args, func_kwargs = [], {}
-            if isinstance(func_args_and_kwargs[0], Sequence):
+            if isinstance(func_args_and_kwargs[0], abc.Sequence):
                 func_args = func_args_and_kwargs[0]
-            elif isinstance(func_args_and_kwargs[0], Mapping):
+            elif isinstance(func_args_and_kwargs[0], abc.Mapping):
                 func_kwargs = func_args_and_kwargs[0]
             else:
                 raise ValueError(
@@ -248,12 +248,12 @@
                     "either a Sequence or a Mapping"
                 )
         elif len(func_args_and_kwargs) == 2:
-            if not isinstance(func_args_and_kwargs[0], Sequence):
+            if not isinstance(func_args_and_kwargs[0], abc.Sequence):
                 raise ValueError(
                     "Positional argument at 1st position of "
                     "func_args_and_kwargs should be a sequence."
                 )
-            if not isinstance(func_args_and_kwargs[1], Mapping):
+            if not isinstance(func_args_and_kwargs[1], abc.Mapping):
                 raise ValueError(
                     "Key-word argument at 2nd position of "
                     "func_args_and_kwargs should be a dictionary mapping."

From 17d49faf19f9c5ff52a3ddeb7ddcee5545fa0f11 Mon Sep 17 00:00:00 2001
From: Alfred Xu
Date: Tue, 19 Apr 2022 09:05:28 +0800
Subject: [PATCH 14/26] Enable segmented_gather in Java package (#10669)

This PR enables the cuDF API `segmented_gather` in the Java package. `segmented_gather` is essential for implementing Spark array functions like `arrays_zip` (https://github.com/NVIDIA/spark-rapids/issues/5229).
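To make the gather semantics concrete, here is a minimal pure-Python sketch of what a segmented gather computes under the `NULLIFY` out-of-bounds policy (an illustration of the semantics only, not the libcudf implementation; the `segmented_gather` helper and the sample rows are hypothetical, chosen to mirror the Java test added in this patch). For each list of length N, valid gather indices lie in [-N, N); anything out of bounds maps to null:

```python
# Hypothetical pure-Python model of a segmented gather with the NULLIFY
# policy; None stands in for null, and a null input row yields a null row.
def segmented_gather(rows, gather_maps):
    out = []
    for row, gmap in zip(rows, gather_maps):
        if row is None:
            out.append(None)
            continue
        n = len(row)
        # Indices in [-N, N) gather with Python-style negative indexing;
        # everything else nullifies.
        out.append([row[i] if -n <= i < n else None for i in gmap])
    return out

rows = [["a", "b", None, "c"], None, [], [None, "A", "B", "C", "D"]]
maps = [[-3, 0, 2, 3, 4], [], [1], [1, -4, 5, -1, -6]]
print(segmented_gather(rows, maps))
# [['b', 'a', None, 'c', None], None, [None], ['A', 'A', None, 'D', None]]
```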
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/10669 --- .../main/java/ai/rapids/cudf/ColumnView.java | 28 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 18 ++++++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 25 +++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index ed3ac124216..b2c001c6737 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1546,6 +1546,31 @@ public ColumnVector segmentedReduce(ColumnView offsets, SegmentedReductionAggreg } } + /** + * Segmented gather of the elements within a list element in each row of a list column. + * For each list, assuming the size is N, valid indices of gather map ranges in [-N, N). + * Out of bound indices refer to null. + * @param gatherMap ListColumnView carrying lists of integral indices which maps the + * element in list of each row in the source columns to rows of lists in the result columns. + * @return the result. + */ + public ColumnVector segmentedGather(ColumnView gatherMap) { + return segmentedGather(gatherMap, OutOfBoundsPolicy.NULLIFY); + } + + /** + * Segmented gather of the elements within a list element in each row of a list column. + * @param gatherMap ListColumnView carrying lists of integral indices which maps the + * element in list of each row in the source columns to rows of lists in the result columns. + * @param policy OutOfBoundsPolicy, `DONT_CHECK` leads to undefined behaviour; `NULLIFY` + * replaces out of bounds with null. + * @return the result. + */ + public ColumnVector segmentedGather(ColumnView gatherMap, OutOfBoundsPolicy policy) { + return new ColumnVector(segmentedGather(getNativeView(), gatherMap.getNativeView(), + policy.equals(OutOfBoundsPolicy.NULLIFY))); + } + /** * Do a reduction on the values in a list. The output type will be the type of the data column * of this list. 
@@ -3998,6 +4023,9 @@ private static native long scan(long viewHandle, long aggregation, private static native long segmentedReduce(long dataViewHandle, long offsetsViewHandle, long aggregation, boolean includeNulls, int dtype, int scale) throws CudfException; + private static native long segmentedGather(long sourceColumnHandle, long gatherMapListHandle, + boolean isNullifyOutBounds) throws CudfException; + private static native long isNullNative(long viewHandle); private static native long isNanNative(long viewHandle); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 8c8e9b91e8d..6a294920d07 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -288,6 +289,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedReduce( CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedGather( + JNIEnv *env, jclass, jlong source_column, jlong gather_map_list, jboolean nullify_out_bounds) { + JNI_NULL_CHECK(env, source_column, "source column view is null", 0); + JNI_NULL_CHECK(env, gather_map_list, "gather map is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const &src_col = + cudf::lists_column_view(*reinterpret_cast(source_column)); + auto const &gather_map = + cudf::lists_column_view(*reinterpret_cast(gather_map_list)); + auto out_bounds_policy = nullify_out_bounds ? cudf::out_of_bounds_policy::NULLIFY : + cudf::out_of_bounds_policy::DONT_CHECK; + return release_as_jlong(cudf::lists::segmented_gather(src_col, gather_map, out_bounds_policy)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_scan(JNIEnv *env, jclass, jlong j_col_view, jlong j_agg, jboolean is_inclusive, jboolean include_nulls) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 58901d5743b..9189cd27303 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -20,6 +20,7 @@ import ai.rapids.cudf.ColumnView.FindOptions; import ai.rapids.cudf.HostColumnVector.*; +import com.google.common.collect.Lists; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -6259,4 +6260,28 @@ void testCopyWithBooleanColumnAsValidity() { }); assertTrue(x.getMessage().contains("Exemplar and validity columns must have the same size")); } + + @Test + void testSegmentedGather() { + HostColumnVector.DataType dt = new ListType(true, new BasicType(true, DType.STRING)); + try (ColumnVector source = ColumnVector.fromLists(dt, + Lists.newArrayList("a", "b", null, "c"), + null, + Lists.newArrayList(), + Lists.newArrayList(null, "A", "B", "C", "D")); + ColumnVector gatherMap = ColumnVector.fromLists( + new ListType(false, new BasicType(false, DType.INT32)), + Lists.newArrayList(-3, 0, 2, 3, 4), + Lists.newArrayList(), + Lists.newArrayList(1), + Lists.newArrayList(1, -4, 5, -1, -6)); + ColumnVector actual = source.segmentedGather(gatherMap); + ColumnVector expected = ColumnVector.fromLists(dt, + Lists.newArrayList("b", "a", null, "c", null), + null, + Lists.newArrayList((String) null), + Lists.newArrayList("A", "A", null, "D", null))) { + assertColumnsAreEqual(expected, actual); + } + } } From 9dc728a5c4edb23e5d531409a120889d319f9a98 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Apr 2022 
01:34:55 -0700
Subject: [PATCH 15/26] Use Lists of Columns for Various Files (#10463)

This PR covers much of the low-hanging fruit for https://github.com/rapidsai/cudf/issues/10153. All APIs that previously accepted Frames now accept a list of columns in the following files:

- hash.pyx
- interop.pyx
- join.pyx
- partitioning.pyx
- quantiles.pyx
- reshape.pyx
- search.pyx
- transform.pyx
- lists.pyx
- string/combine.pyx

This PR covers point 5 in the follow-ups to https://github.com/rapidsai/cudf/pull/9889.

Also, in `join.pyx`, the GIL was not released when dispatching work to libcudf. This PR fixes that.

Authors:
- Michael Wang (https://github.com/isVoid)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/10463
---
 python/cudf/cudf/_lib/hash.pyx            |  32 ++---
 python/cudf/cudf/_lib/interop.pyx         |  65 ++++------
 python/cudf/cudf/_lib/join.pyx            |  51 +++-----
 python/cudf/cudf/_lib/lists.pyx           |  21 ++-
 python/cudf/cudf/_lib/partitioning.pyx    |  22 +---
 python/cudf/cudf/_lib/quantiles.pyx       |  14 +-
 python/cudf/cudf/_lib/reshape.pyx         |  23 ++--
 python/cudf/cudf/_lib/scalar.pyx          |  22 +---
 python/cudf/cudf/_lib/search.pyx          |  24 ++--
 python/cudf/cudf/_lib/strings/combine.pyx |   9 +-
 python/cudf/cudf/_lib/transform.pyx       |  20 +--
 python/cudf/cudf/core/column/column.py    |  20 +--
 python/cudf/cudf/core/column/lists.py     |   4 +-
 python/cudf/cudf/core/column/string.py    |   6 +-
 python/cudf/cudf/core/dataframe.py        |  85 ++++++++++---
 python/cudf/cudf/core/frame.py            | 148 +++++++--------------
 python/cudf/cudf/core/groupby/groupby.py  |   7 +-
 python/cudf/cudf/core/index.py            |   4 +-
 python/cudf/cudf/core/indexed_frame.py    |  58 +++++++--
 python/cudf/cudf/core/join/join.py        |  12 +-
 python/cudf/cudf/io/dlpack.py             |  13 +-
 python/cudf/cudf/tests/test_search.py     |  10 +-
 22 files changed, 299 insertions(+), 371 deletions(-)

diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 301f571f5fb..8bb8ab92a48 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
 from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


-def hash_partition(source_table, object columns_to_hash,
-                   int num_partitions, bool keep_index=True):
+def hash_partition(list source_columns, object columns_to_hash,
+                   int num_partitions):
     cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
     cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_table(
-        source_table, not keep_index
-    )
+    cdef table_view c_source_view = table_view_from_columns(source_columns)

     cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
     with nogil:
         c_result = move(
             cpp_hash_partition(
                 c_source_view,
                 c_columns_to_hash,
                 c_num_partitions
             )
         )

     # Note that the offsets (`c_result.second`) may be empty when
-    # the original table (`source_table`) is empty. We need to
+    # the original table (`source_columns`) is empty. We need to
     # return a list of zeros in this case.
return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=( - source_table._index_names - if keep_index is True - else None - ) - - ), - list(c_result.second) if c_result.second.size() - else [0] * num_partitions + columns_from_unique_ptr(move(c_result.first)), + list(c_result.second) + if c_result.second.size() else [0] * num_partitions ) -def hash(source_table, str method, int seed=0): - cdef table_view c_source_view = table_view_from_table( - source_table, ignore_index=True) +def hash(list source_columns, str method, int seed=0): + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result cdef cpp_hash_id c_hash_function if method == "murmur3": diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 06e287ee670..88c8b19ded0 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import cudf @@ -20,12 +20,12 @@ from cudf._lib.cpp.interop cimport ( ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def from_dlpack(dlpack_capsule): """ - Converts a DLPack Tensor PyCapsule into a cudf Frame object. + Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ @@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) + res = columns_from_unique_ptr(move(c_result)) dlpack_tensor.deleter(dlpack_tensor) return res -def to_dlpack(source_table): +def to_dlpack(list source_columns): """ - Converts a cudf Frame into a DLPack Tensor PyCapsule. + Converts a list of columns into a DLPack Tensor PyCapsule. DLPack Tensor PyCapsule will have the name "dltensor". """ - for column in source_table._columns: - if column.null_count: - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) + if any(column.null_count for column in source_columns): + raise ValueError( + "Cannot create a DLPack tensor with null values. \ + Input is required to have null count as zero." + ) cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: dlpack_tensor = cpp_to_dlpack( @@ -110,17 +104,14 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *: raise ValueError("Malformed metadata has been encountered") -def to_arrow(input_table, - object metadata, - bool keep_index=True): - """Convert from cudf Frame to PyArrow Table. +def to_arrow(list source_columns, object metadata): + """Convert a list of columns from + cudf Frame to a PyArrow Table. 
Parameters ---------- - input_table : cudf table - column_names : names for the pyarrow arrays - field_names : field names for nested type arrays - keep_index : whether index needs to be part of arrow table + source_columns : a list of columns to convert + metadata : a list of metadata, see `gather_metadata` for layout Returns ------- @@ -128,9 +119,7 @@ def to_arrow(input_table, """ cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata) - cdef table_view input_table_view = ( - table_view_from_table(input_table, not keep_index) - ) + cdef table_view input_table_view = table_view_from_columns(source_columns) cdef shared_ptr[CTable] cpp_arrow_table with nogil: @@ -141,22 +130,16 @@ def to_arrow(input_table, return pyarrow_wrap_table(cpp_arrow_table) -def from_arrow( - object input_table, - object column_names=None, - object index_names=None -): - """Convert from PyArrow Table to cudf Frame. +def from_arrow(object input_table): + """Convert from PyArrow Table to a list of columns. Parameters ---------- input_table : PyArrow table - column_names : names for the cudf table data columns - index_names : names for the cudf table index columns Returns ------- - cudf Frame + A list of columns to construct Frame object """ cdef shared_ptr[CTable] cpp_arrow_table = ( pyarrow_unwrap_table(input_table) @@ -166,8 +149,4 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - return data_from_unique_ptr( - move(c_result), - column_names=column_names, - index_names=index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 5921f06d36e..1baef266dab 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from itertools import chain @@ -16,31 +16,25 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns # The functions below return the *gathermaps* that represent # the join result when joining on the keys `lhs` and `rhs`. 
-cpdef join(lhs, rhs, how=None): +cpdef join(list lhs, list rhs, how=None): cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "inner": - c_result = move(cpp_join.inner_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.inner_join(c_lhs, c_rhs)) elif how == "left": - c_result = move(cpp_join.left_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_join(c_lhs, c_rhs)) elif how == "outer": - c_result = move(cpp_join.full_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.full_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") @@ -49,30 +43,23 @@ cpdef join(lhs, rhs, how=None): return left_rows, right_rows -cpdef semi_join(lhs, rhs, how=None): +cpdef semi_join(list lhs, list rhs, how=None): # left-semi and left-anti joins cdef cpp_join.gather_map_type c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "leftsemi": - c_result = move(cpp_join.left_semi_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs)) elif how == "leftanti": - c_result = move(cpp_join.left_anti_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") cdef Column left_rows = _gather_map_as_column(move(c_result)) - return ( - left_rows, - None - ) + return left_rows, None cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 523686fafe6..e5a705ab603 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -42,7 +42,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of from cudf._lib.cpp.lists.extract cimport extract_list_element -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def count_elements(Column col): @@ -61,8 +61,10 @@ def count_elements(Column col): return result -def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): - cdef table_view c_table_view = table_view_from_table(tbl, ignore_index) +def explode_outer( + list source_columns, int explode_column_idx +): + cdef table_view c_table_view = table_view_from_columns(source_columns) cdef size_type c_explode_column_idx = explode_column_idx cdef unique_ptr[table] c_result @@ -70,11 +72,7 @@ def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return data_from_unique_ptr( - move(c_result), - column_names=tbl._column_names, - index_names=None if ignore_index else tbl._index_names - ) + return columns_from_unique_ptr(move(c_result)) def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): @@ -197,18 +195,17 @@ def index_of(Column col, object py_search_key): return Column.from_unique_ptr(move(c_result)) -def concatenate_rows(tbl): +def concatenate_rows(list source_columns): cdef unique_ptr[column] 
c_result - cdef table_view c_table_view = table_view_from_table(tbl) + cdef table_view c_table_view = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_concatenate_rows( c_table_view, )) - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_unique_ptr(move(c_result)) def concatenate_list_elements(Column input_column, dropna=False): diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index e53667e7589..f2f5a92aca1 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.partitioning cimport partition as cpp_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types -def partition(source_table, Column partition_map, - object num_partitions, bool keep_index=True): +def partition(list source_columns, Column partition_map, + object num_partitions): if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef column_view c_partition_map_view = partition_map.view() @@ -40,13 +38,5 @@ def partition(source_table, Column partition_map, ) return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None - - ), - list(c_result.second) + columns_from_unique_ptr(move(c_result.first)), list(c_result.second) ) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 497a71df89d..f65c29a55a8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -31,7 +31,7 @@ from cudf._lib.cpp.types cimport ( order_info, sorted, ) -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def quantile( @@ -74,14 +74,13 @@ def quantile( return Column.from_unique_ptr(move(c_result)) -def quantiles(source_table, +def quantiles(list source_columns, vector[double] q, object interp, object is_input_sorted, list column_order, list null_precedence): - cdef table_view c_input = table_view_from_table( - source_table, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) cdef vector[double] c_q = q cdef interpolation c_interp = ( interp @@ -119,7 +118,4 @@ def quantiles(source_table, ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index d64d0543892..29223947eea 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,32 +13,25 @@ from cudf._lib.cpp.reshape cimport ( from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -def interleave_columns(source_table): - cdef table_view c_view = table_view_from_table( - source_table, ignore_index=True) +def interleave_columns(list source_columns): + cdef table_view c_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result with nogil: c_result = move(cpp_interleave_columns(c_view)) - return Column.from_unique_ptr( - move(c_result) - ) + return Column.from_unique_ptr(move(c_result)) -def tile(source_table, size_type count): +def tile(list source_columns, size_type count): cdef size_type c_count = count - cdef table_view c_view = table_view_from_table(source_table) + cdef table_view c_view = table_view_from_columns(source_columns) cdef unique_ptr[table] c_result with nogil: c_result = move(cpp_tile(c_view, c_count)) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=source_table._index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 32d6cb2ea6d..a7acfa8f906 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -65,6 +65,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_us, ) from cudf._lib.utils cimport ( + columns_from_table_view, data_from_table_view, table_view_from_columns, table_view_from_table, @@ -361,8 +362,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, names=columns ) - data, _ = from_arrow(pyarrow_table, column_names=columns) - cdef table_view struct_view = table_view_from_columns(data.values()) + data = from_arrow(pyarrow_table) + cdef table_view struct_view = table_view_from_columns(data) s.reset( new struct_scalar(struct_view, valid) @@ -373,18 +374,10 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): return cudf.NA cdef table_view struct_table_view 
= (s.get()).view() - columns = [str(i) for i in range(struct_table_view.num_columns())] + column_names = [str(i) for i in range(struct_table_view.num_columns())] - data, _ = data_from_table_view( - struct_table_view, - None, - column_names=columns - ) - to_arrow_table = cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor(data) - ) - - python_dict = to_arrow(to_arrow_table, columns).to_pydict() + columns = columns_from_table_view(struct_table_view, None) + python_dict = to_arrow(columns, column_names).to_pydict() return {k: _nested_na_replace(python_dict[k])[0] for k in python_dict} @@ -415,9 +408,8 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s): cdef column_view list_col_view = (s.get()).view() cdef Column list_col = Column.from_column_view(list_col_view, None) - to_arrow_table = cudf.core.frame.Frame({"col": list_col}) - arrow_table = to_arrow(to_arrow_table, [["col", []]]) + arrow_table = to_arrow([list_col], [["col", []]]) result = arrow_table['col'].to_pylist() return _nested_na_replace(result) diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index f92ef753fc2..d5568f53231 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -10,20 +10,20 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns def search_sorted( - table, values, side, ascending=True, na_position="last" + list source, list values, side, ascending=True, na_position="last" ): """Find indices where elements should be inserted to maintain order Parameters ---------- - table : Frame - Frame to search in - values : Frame - Frame of values to search for + source : list of columns + List of columns to search in + values : List of columns + List of value columns to search for side : str {‘left’, ‘right’} optional If ‘left’, the index of the first suitable location is given. If ‘right’, return the last such index @@ -33,10 +33,8 @@ def search_sorted( cdef vector[libcudf_types.null_order] c_null_precedence cdef libcudf_types.order c_order cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_table( - table, ignore_index=True) - cdef table_view c_values_data = table_view_from_table( - values, ignore_index=True) + cdef table_view c_table_data = table_view_from_columns(source) + cdef table_view c_values_data = table_view_from_columns(values) # Note: We are ignoring index columns here c_order = (libcudf_types.order.ASCENDING @@ -47,9 +45,9 @@ def search_sorted( if na_position=="last" else libcudf_types.null_order.BEFORE ) - c_column_order = vector[libcudf_types.order](table._num_columns, c_order) + c_column_order = vector[libcudf_types.order](len(source), c_order) c_null_precedence = vector[libcudf_types.null_order]( - table._num_columns, c_null_order + len(source), c_null_order ) if side == 'left': diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 3b5ef33a668..eeb39f70728 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -18,10 +18,10 @@ from cudf._lib.cpp.strings.combine cimport ( from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns -def concatenate(source_strings, +def concatenate(list source_strings, object sep, object na_rep): """ @@ -33,8 +33,7 @@ def concatenate(source_strings, cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_table( - source_strings, ignore_index=True) + cdef table_view source_view = table_view_from_columns(source_strings) cdef const string_scalar* scalar_separator = \ (separator.get_raw_ptr()) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 96d25cb92c9..175150b6865 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np from numba.np import numpy_support @@ -25,9 +25,9 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( + columns_from_unique_ptr, data_from_table_view, - data_from_unique_ptr, - table_view_from_table, + table_view_from_columns, ) @@ -123,21 +123,15 @@ def transform(Column input, op): return Column.from_unique_ptr(move(c_output)) -def table_encode(input): - cdef table_view c_input = table_view_from_table( - input, ignore_index=True) +def table_encode(list source_columns): + cdef table_view c_input = table_view_from_columns(source_columns) cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: c_result = move(libcudf_transform.encode(c_input)) - return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=input._column_names, - ), - Column.from_unique_ptr(move(c_result.second)) - ) + return columns_from_unique_ptr( + move(c_result.first)), Column.from_unique_ptr(move(c_result.second)) def one_hot_encode(Column input_column, Column categories): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2e3e42531b..5c9d8535798 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -229,13 +229,9 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow( - cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor({"None": self}) - ), - [["None"]], - keep_index=False, - )["None"].chunk(0) + return libcudf.interop.to_arrow([self], [["None"]],)[ + "None" + ].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -280,12 +276,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: } ) - codes = libcudf.interop.from_arrow( - indices_table, indices_table.column_names - )[0]["None"] - categories = libcudf.interop.from_arrow( - dictionaries_table, dictionaries_table.column_names - )[0]["None"] + codes = libcudf.interop.from_arrow(indices_table)[0] + categories = libcudf.interop.from_arrow(dictionaries_table)[0] return build_categorical_column( categories=categories, @@ -301,7 +293,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return 
cudf.core.column.IntervalColumn.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] + result = libcudf.interop.from_arrow(data)[0] return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 8578bfe8147..b383f7bc321 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -113,9 +113,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return NotImplemented if isinstance(other.dtype, ListDtype): if op == "__add__": - return concatenate_rows( - cudf.core.frame.Frame({0: self, 1: other}) - ) + return concatenate_rows([self, other]) else: raise NotImplementedError( "Lists concatenation for this operation is not yet" diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d5d45c341d5..6f4a6334a1d 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -365,9 +365,7 @@ def cat(self, others=None, sep=None, na_rep=None): other_cols = _get_cols_list(self._parent, others) all_cols = [self._column] + other_cols data = libstrings.concatenate( - cudf.DataFrame( - {index: value for index, value in enumerate(all_cols)} - ), + all_cols, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) @@ -5531,7 +5529,7 @@ def _binaryop( return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), + [lhs, rhs], sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4ffacfa2ccc..50255b07077 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3933,19 +3933,16 @@ def partition_by_hash(self, columns, nparts, keep_index=True): ------- partitioned: list of DataFrame """ - idx = ( - 0 - if (self._index is None or keep_index is False) - else self._index._num_columns - ) - key_indices = [self._data.names.index(k) + idx for k in columns] - output_data, output_index, offsets = libcudf.hash.hash_partition( - self, key_indices, nparts, keep_index + key_indices = [self._column_names.index(k) for k in columns] + output_columns, offsets = libcudf.hash.hash_partition( + [*self._columns], key_indices, nparts + ) + outdf = self._from_columns_like_self( + [*(self._index._columns if keep_index else ()), *output_columns], + self._column_names, + self._index_names if keep_index else None, ) - outdf = self.__class__._from_data(output_data, output_index) - outdf._copy_type_metadata(self, include_index=keep_index) - # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] @@ -5678,22 +5675,24 @@ def stack(self, level=-1, dropna=True): """ assert level in (None, -1) repeated_index = self.index.repeat(self.shape[1]) - name_index = cudf.DataFrame._from_data({0: self._column_names}).tile( - self.shape[0] + name_index = libcudf.reshape.tile( + [as_column(self._column_names)], self.shape[0] ) - new_index = list(repeated_index._columns) + [name_index._columns[0]] + new_index_columns = [*repeated_index._columns, *name_index] if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: - index_names = [None] * len(new_index) + index_names = [None] * len(new_index_columns) new_index = MultiIndex.from_frame( - DataFrame(dict(zip(range(0, len(new_index)), new_index))), + DataFrame._from_data( + dict(zip(range(0, 
len(new_index_columns)), new_index_columns))
+ ),
 names=index_names,
 )
 # Collect datatypes and cast columns as that type
 common_type = np.result_type(*self.dtypes)
- homogenized = DataFrame(
+ homogenized = DataFrame._from_data(
 {
 c: (
 self._data[c].astype(common_type)
@@ -5704,9 +5703,15 @@ def stack(self, level=-1, dropna=True):
 }
 )
- data_col = libcudf.reshape.interleave_columns(homogenized)
+ result = Series._from_data(
+ {
+ None: libcudf.reshape.interleave_columns(
+ [*homogenized._columns]
+ )
+ },
+ index=new_index,
+ )
- result = Series(data=data_col, index=new_index)
 if dropna:
 return result.dropna()
 else:
@@ -6167,6 +6172,44 @@ def _from_columns_like_self(
 result._set_column_names_like(self)
 return result
+ @_cudf_nvtx_annotate
+ def interleave_columns(self):
+ """
+ Interleave Series columns of a table into a single column.
+
+ Converts the column-major table into a row-major column.
+
+ Examples
+ --------
+ >>> import cudf
+ >>> df = cudf.DataFrame({0: ['A1', 'A2', 'A3'], 1: ['B1', 'B2', 'B3']})
+ >>> df
+ 0 1
+ 0 A1 B1
+ 1 A2 B2
+ 2 A3 B3
+ >>> df.interleave_columns()
+ 0 A1
+ 1 B1
+ 2 A2
+ 3 B2
+ 4 A3
+ 5 B3
+ dtype: object
+
+ Returns
+ -------
+ The interleaved columns as a single column
+ """
+ if ("category" == self.dtypes).any():
+ raise ValueError(
+ "interleave_columns does not support 'category' dtype."
+ )
+
+ return self._constructor_sliced._from_data(
+ {None: libcudf.reshape.interleave_columns([*self._columns])}
+ )
+
 def from_dataframe(df, allow_copy=False):
 return df_protocol.from_dataframe(df, allow_copy=allow_copy)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 806cdf14c71..d10f7c690bf 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -960,10 +960,16 @@ def scatter_by_map(
 f"ERROR: map_size must be >= {count} (got {map_size})."
) - data, index, output_offsets = libcudf.partitioning.partition( - self, map_index, map_size, keep_index + partitioned_columns, output_offsets = libcudf.partitioning.partition( + [*(self._index._columns if keep_index else ()), *self._columns], + map_index, + map_size, + ) + partitioned = self._from_columns_like_self( + partitioned_columns, + column_names=self._column_names, + index_names=self._index_names if keep_index else None, ) - partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -973,9 +979,6 @@ def scatter_by_map( result = partitioned._split(output_offsets, keep_index=keep_index) - for frame in result: - frame._copy_type_metadata(self, include_index=keep_index) - if map_size: result += [ self._empty_like(keep_index) @@ -1274,20 +1277,18 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_data( - *libcudf.quantiles.quantiles( - self, + return self._from_columns_like_self( + libcudf.quantiles.quantiles( + [*self._columns], q, interpolation, is_sorted, column_order, null_precedence, - ) + ), + column_names=self._column_names, ) - result._copy_type_metadata(self) - return result - @_cudf_nvtx_annotate def rank( self, @@ -1466,30 +1467,33 @@ def from_arrow(cls, data): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - cudf_indices_frame, _ = libcudf.interop.from_arrow( - dict_indices_table, dict_indices_table.column_names - ) + indices_columns = libcudf.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in cudf_indices_frame.items(): - cudf_category_frame[name] = build_categorical_column( + cudf_category_frame = { + name: build_categorical_column( cudf_dictionaries_columns[name], codes, mask=codes.base_mask, size=codes.size, ordered=dict_ordered[name], ) + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ) + } # Handle non-dict arrays - cudf_non_category_frame = ( - {} - if data.num_columns == 0 - else libcudf.interop.from_arrow(data, data.column_names)[0] - ) + cudf_non_category_frame = { + name: col + for name, col in zip( + data.column_names, libcudf.interop.from_arrow(data) + ) + } result = {**cudf_non_category_frame, **cudf_category_frame} @@ -2027,76 +2031,6 @@ def notnull(self): # Alias for notnull notna = notnull - @_cudf_nvtx_annotate - def interleave_columns(self): - """ - Interleave Series columns of a table into a single column. - - Converts the column major table `cols` into a row major column. - - Parameters - ---------- - cols : input Table containing columns to interleave. - - Examples - -------- - >>> df = DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']]) - >>> df - 0 [A1, A2, A3] - 1 [B1, B2, B3] - >>> df.interleave_columns() - 0 A1 - 1 B1 - 2 A2 - 3 B2 - 4 A3 - 5 B3 - - Returns - ------- - The interleaved columns as a single column - """ - if ("category" == self.dtypes).any(): - raise ValueError( - "interleave_columns does not support 'category' dtype." - ) - - result = self._constructor_sliced( - libcudf.reshape.interleave_columns(self) - ) - - return result - - @_cudf_nvtx_annotate - def tile(self, count): - """ - Repeats the rows from `self` DataFrame `count` times to form a - new DataFrame. 
-
- Parameters
- ----------
- self : input Table containing columns to interleave.
- count : Number of times to tile "rows". Must be non-negative.
-
- Examples
- --------
- >>> df = Dataframe([[8, 4, 7], [5, 2, 3]])
- >>> count = 2
- >>> df.tile(df, count)
- 0 1 2
- 0 8 4 7
- 1 5 2 3
- 0 8 4 7
- 1 5 2 3
-
- Returns
- -------
- The table containing the tiled "rows".
- """
- result = self.__class__._from_data(*libcudf.reshape.tile(self, count))
- result._copy_type_metadata(self)
- return result
-
 @_cudf_nvtx_annotate
 def searchsorted(
 self, values, side="left", ascending=True, na_position="last"
@@ -2166,12 +2100,24 @@ def searchsorted(
 scalar_flag = True
 if not isinstance(values, Frame):
- values = as_column(values)
- if values.dtype != self.dtype:
- self = self.astype(values.dtype)
- values = values.as_frame()
+ values = [as_column(values)]
+ else:
+ values = [*values._columns]
+ if len(values) != len(self._data):
+ raise ValueError("Mismatched number of columns to search for.")
+
+ sources = [
+ col
+ if is_dtype_equal(col.dtype, val.dtype)
+ else col.astype(val.dtype)
+ for col, val in zip(self._columns, values)
+ ]
 outcol = libcudf.search.search_sorted(
- self, values, side, ascending=ascending, na_position=na_position
+ sources,
+ values,
+ side,
+ ascending=ascending,
+ na_position=na_position,
 )
 # Retrun result as cupy array if the values is non-scalar
@@ -2462,10 +2408,8 @@ def _split(self, splits):
 @_cudf_nvtx_annotate
 def _encode(self):
- data, index, indices = libcudf.transform.table_encode(self)
- for name, col in data.items():
- data[name] = col._with_type_metadata(self._data[name].dtype)
- keys = self.__class__._from_data(data, index)
+ columns, indices = libcudf.transform.table_encode([*self._columns])
+ keys = self._from_columns_like_self(columns)
 return keys, indices
 @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 76b0217df3b..249cb7f4343 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1105,16 +1105,11 @@ def _cov_or_corr(self, func, method_name):
 for i in range(0, len(cols_list), num_cols)
 ]
- def combine_columns(gb_cov_corr, ys):
- list_of_columns = [gb_cov_corr._data[y] for y in ys]
- frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys)
- return interleave_columns(frame)
-
 # interleave: combines the correlation or covariance results for each
 # column-pair into a single column
 res = cudf.DataFrame._from_data(
 {
- x: combine_columns(gb_cov_corr, ys)
+ x: interleave_columns([gb_cov_corr._data[y] for y in ys])
 for ys, x in zip(cols_split, column_names)
 }
 )
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index aff13025e72..fd918f723fe 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -76,10 +76,10 @@ def _lexsorted_equal_range(
 sort_inds = None
 sort_vals = idx
 lower_bound = search_sorted(
- sort_vals, key_as_table, side="left"
+ [*sort_vals._data.columns], [*key_as_table._columns], side="left"
 ).element_indexing(0)
 upper_bound = search_sorted(
- sort_vals, key_as_table, side="right"
+ [*sort_vals._data.columns], [*key_as_table._columns], side="right"
 ).element_indexing(0)
 return lower_bound, upper_bound, sort_inds
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ea722ec3968..ddb3082af96 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -818,7 +818,8 @@ def hash_values(self,
method="murmur3"):
 # calculation, necessitating the unfortunate circular reference to the
 # child class here.
 return cudf.Series._from_data(
- {None: libcudf.hash.hash(self, method)}, index=self.index
+ {None: libcudf.hash.hash([*self._columns], method)},
+ index=self.index,
 )
 def _gather(
@@ -2690,21 +2691,51 @@ def _explode(self, explode_column: Any, ignore_index: bool):
 if not ignore_index and self._index is not None:
 explode_column_num += self._index.nlevels
- data, index = libcudf.lists.explode_outer(
- self, explode_column_num, ignore_index
+ exploded = libcudf.lists.explode_outer(
+ [
+ *(self._index._data.columns if not ignore_index else ()),
+ *self._columns,
+ ],
+ explode_column_num,
 )
- res = self.__class__._from_data(
- ColumnAccessor(
- data,
- multiindex=self._data.multiindex,
- level_names=self._data._level_names,
- ),
- index=index,
+
+ return self._from_columns_like_self(
+ exploded,
+ self._column_names,
+ self._index_names if not ignore_index else None,
 )
- if not ignore_index and self._index is not None:
- res.index.names = self._index.names
- return res
+ @_cudf_nvtx_annotate
+ def tile(self, count):
+ """Repeats the rows `count` times to form a new Frame.
+
+ Parameters
+ ----------
+ count : Number of times to tile "rows". Must be non-negative.
+
+ Examples
+ --------
+ >>> import cudf
+ >>> df = cudf.DataFrame([[8, 4, 7], [5, 2, 3]])
+ >>> count = 2
+ >>> df.tile(count)
+ 0 1 2
+ 0 8 4 7
+ 1 5 2 3
+ 0 8 4 7
+ 1 5 2 3
+
+ Returns
+ -------
+ The indexed frame containing the tiled "rows".
+ """
+ return self._from_columns_like_self(
+ libcudf.reshape.tile(
+ [*self._index._columns, *self._columns], count
+ ),
+ column_names=self._column_names,
+ index_names=self._index_names,
+ )
 @_cudf_nvtx_annotate
 @docutils.doc_apply(
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index c7e46cf0165..6a495ef8d9a 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -177,15 +177,15 @@ def __init__(
 )
 def perform_merge(self) -> Frame:
- left_join_cols = {}
- right_join_cols = {}
+ left_join_cols = []
+ right_join_cols = []
 for left_key, right_key in zip(self._left_keys, self._right_keys):
 lcol = left_key.get(self.lhs)
 rcol = right_key.get(self.rhs)
 lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how)
- left_join_cols[left_key.name] = lcol_casted
- right_join_cols[left_key.name] = rcol_casted
+ left_join_cols.append(lcol_casted)
+ right_join_cols.append(rcol_casted)
 # Categorical dtypes must be cast back from the underlying codes
 # type that was returned by _match_join_keys.
@@ -201,8 +201,8 @@ def perform_merge(self) -> Frame:
 right_key.set(self.rhs, rcol_casted, validate=False)
 left_rows, right_rows = self._joiner(
- cudf.core.frame.Frame(left_join_cols),
- cudf.core.frame.Frame(right_join_cols),
+ left_join_cols,
+ right_join_cols,
 how=self.how,
 )
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index 00a2cb4cee2..644643db83c 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 import cudf
@@ -34,12 +34,13 @@ def from_dlpack(pycapsule_obj):
 tensor is row-major, transpose it before passing it to this function.
""" - data, _ = libdlpack.from_dlpack(pycapsule_obj) + columns = libdlpack.from_dlpack(pycapsule_obj) + column_names = range(len(columns)) - if len(data) == 1: - return cudf.Series._from_data(data) + if len(columns) == 1: + return cudf.Series._from_columns(columns, column_names=column_names) else: - return cudf.DataFrame._from_data(data) + return cudf.DataFrame._from_columns(columns, column_names=column_names) @ioutils.doc_to_dlpack() @@ -91,4 +92,4 @@ def to_dlpack(cudf_obj): ) gdf = gdf.astype(dtype) - return libdlpack.to_dlpack(gdf) + return libdlpack.to_dlpack([*gdf._columns]) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index cd029d02d79..d3433a589a7 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import cupy import numpy as np import pandas as pd @@ -73,6 +73,14 @@ def test_searchsorted_dataframe(side, multiindex): assert result == [2, 0, 4, 1] +def test_search_sorted_dataframe_unequal_number_of_columns(): + values = cudf.DataFrame({"a": [1, 0, 5, 1]}) + base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) + + with pytest.raises(ValueError, match="Mismatch number of columns"): + base.searchsorted(values) + + @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_categorical(side): From ba1173d326fc540183dd2563cc7b4b66127cd222 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 19 Apr 2022 18:50:04 +0530 Subject: [PATCH 16/26] cleanup benchmark includes (#10661) - remove cudf_test unnecessary includes - fix include order - remove benchmark/benchmark.h when benchmark fixture is included Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10661 --- cpp/benchmarks/copying/contiguous_split.cu | 1 + cpp/benchmarks/copying/copy_if_else.cpp | 1 - cpp/benchmarks/hashing/hash.cpp | 1 - cpp/benchmarks/io/csv/csv_reader.cpp | 2 -- cpp/benchmarks/io/csv/csv_writer.cpp | 2 -- cpp/benchmarks/io/cuio_common.hpp | 4 ++-- cpp/benchmarks/io/orc/orc_reader.cpp | 2 -- cpp/benchmarks/io/orc/orc_writer.cpp | 2 -- cpp/benchmarks/io/parquet/parquet_reader.cpp | 2 -- cpp/benchmarks/io/parquet/parquet_writer.cpp | 2 -- .../io/parquet/parquet_writer_chunks.cpp | 11 ++--------- cpp/benchmarks/io/text/multibyte_split.cpp | 5 +---- cpp/benchmarks/iterator/iterator.cu | 9 ++++----- cpp/benchmarks/join/join_common.hpp | 4 +--- cpp/benchmarks/merge/merge.cpp | 11 ++++------- cpp/benchmarks/null_mask/set_null_mask.cpp | 2 -- cpp/benchmarks/reduction/scan.cpp | 1 - cpp/benchmarks/replace/clamp.cpp | 1 - cpp/benchmarks/replace/nans.cpp | 1 - cpp/benchmarks/sort/rank.cpp | 13 +++---------- cpp/benchmarks/sort/sort.cpp | 11 ++--------- cpp/benchmarks/sort/sort_strings.cpp | 1 - cpp/benchmarks/sort/sort_structs.cpp | 5 ++--- cpp/benchmarks/string/case.cpp | 1 - cpp/benchmarks/string/combine.cpp | 2 -- cpp/benchmarks/string/contains.cpp | 1 - cpp/benchmarks/string/convert_datetime.cpp | 1 - cpp/benchmarks/string/convert_durations.cpp | 16 ++++++---------- cpp/benchmarks/string/convert_fixed_point.cpp | 6 ++---- cpp/benchmarks/string/convert_numerics.cpp | 6 ++---- cpp/benchmarks/string/copy.cu | 2 +- cpp/benchmarks/string/extract.cpp | 3 ++- 
cpp/benchmarks/string/factory.cu | 4 ++-- cpp/benchmarks/string/filter.cpp | 4 ++-- cpp/benchmarks/string/find.cpp | 4 ++-- cpp/benchmarks/string/repeat_strings.cpp | 1 - cpp/benchmarks/string/replace.cpp | 8 ++++---- cpp/benchmarks/string/replace_re.cpp | 4 ++-- cpp/benchmarks/string/split.cpp | 4 ++-- cpp/benchmarks/string/substring.cpp | 9 ++++----- cpp/benchmarks/string/translate.cpp | 8 ++++---- cpp/benchmarks/string/url_decode.cu | 8 ++------ cpp/benchmarks/text/ngrams.cpp | 2 -- cpp/benchmarks/text/normalize.cpp | 3 --- cpp/benchmarks/text/normalize_spaces.cpp | 3 --- cpp/benchmarks/text/replace.cpp | 7 ++++--- cpp/benchmarks/text/subword.cpp | 6 +++--- cpp/benchmarks/text/tokenize.cpp | 5 ++--- .../type_dispatcher/type_dispatcher.cu | 2 -- 49 files changed, 68 insertions(+), 146 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index 6b129a4a435..a61b18df8d1 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -17,6 +17,7 @@ #include #include #include + #include #include diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index 6f094aba680..6f355118f49 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 1110b6fe9ef..9c0ef5b528d 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp index 6f5e7160cd3..b61ba75ce6e 100644 --- a/cpp/benchmarks/io/csv/csv_reader.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index 65aa31c68dc..079df45b1d8 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index ff900d20e6f..8ea29684aae 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -16,12 +16,12 @@ #pragma once +#include + #include #include #include -#include - using cudf::io::io_type; #define RD_BENCHMARK_DEFINE_ALL_SOURCES(benchmark, name, type_or_group) \ diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index fc76fbe7603..7d6eb432b5b 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index f61dac7677b..4e7781b402a 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -14,8 +14,6 @@ * limitations under the License. 
*/ -#include - #include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index b20534e8ac0..af7121d37dc 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index d25fae42d0e..776121028ef 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index 30ed245ed9a..e22696b9c01 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -14,21 +14,14 @@ * limitations under the License. */ -#include - -#include -#include - -#include -#include -#include - #include #include #include #include +#include #include +#include // to enable, run cmake with -DBUILD_BENCHMARKS=ON diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index af6c2c5e030..d274f79a77c 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -19,10 +19,9 @@ #include #include -#include - #include +#include #include #include #include @@ -38,8 +37,6 @@ #include #include -using cudf::test::fixed_width_column_wrapper; - temp_directory const temp_dir("cudf_gbench"); enum data_chunk_source_type { diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 595775ddf00..5eaaec23211 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -14,13 +14,14 @@ * limitations under the License. */ -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include + +#include #include #include #include -#include #include @@ -31,8 +32,6 @@ #include #include -#include - #include template diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 6ff2543cf7d..a031b4e656d 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -21,10 +21,8 @@ #include #include -#include -#include - #include +#include #include #include #include diff --git a/cpp/benchmarks/merge/merge.cpp b/cpp/benchmarks/merge/merge.cpp index 88354bcc731..82d89233a33 100644 --- a/cpp/benchmarks/merge/merge.cpp +++ b/cpp/benchmarks/merge/merge.cpp @@ -14,18 +14,15 @@ * limitations under the License. */ -#include +#include +#include +#include #include +#include #include #include -#include - -#include -#include -#include - #include #include diff --git a/cpp/benchmarks/null_mask/set_null_mask.cpp b/cpp/benchmarks/null_mask/set_null_mask.cpp index 2057951ff8d..429a62a2bfa 100644 --- a/cpp/benchmarks/null_mask/set_null_mask.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask.cpp @@ -19,8 +19,6 @@ #include -#include - class SetNullmask : public cudf::benchmark { }; diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index aef4960789a..8c434465795 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include diff --git a/cpp/benchmarks/replace/clamp.cpp b/cpp/benchmarks/replace/clamp.cpp index d3a7415a478..e9a259d0c7b 100644 --- a/cpp/benchmarks/replace/clamp.cpp +++ b/cpp/benchmarks/replace/clamp.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/replace/nans.cpp b/cpp/benchmarks/replace/nans.cpp index e1b05bbc337..28ca798ebf0 100644 --- a/cpp/benchmarks/replace/nans.cpp +++ b/cpp/benchmarks/replace/nans.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index c3c77ebd52f..0a5c1844c69 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -14,20 +14,13 @@ * limitations under the License. */ -#include -#include - -#include -#include -#include -#include -#include - -#include #include #include #include +#include +#include + class Rank : public cudf::benchmark { }; diff --git a/cpp/benchmarks/sort/sort.cpp b/cpp/benchmarks/sort/sort.cpp index 1a42daa5bb0..d7c33e7170e 100644 --- a/cpp/benchmarks/sort/sort.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -14,19 +14,12 @@ * limitations under the License. */ -#include - -#include -#include -#include -#include -#include - -#include #include #include #include +#include + template class Sort : public cudf::benchmark { }; diff --git a/cpp/benchmarks/sort/sort_strings.cpp b/cpp/benchmarks/sort/sort_strings.cpp index 30a7aee043b..a58b9a4f6da 100644 --- a/cpp/benchmarks/sort/sort_strings.cpp +++ b/cpp/benchmarks/sort/sort_strings.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp index 81f7ad8a4c1..9b6c32940f5 100644 --- a/cpp/benchmarks/sort/sort_structs.cpp +++ b/cpp/benchmarks/sort/sort_structs.cpp @@ -16,11 +16,10 @@ #include -#include - -#include #include +#include + #include #include diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 0d74d0a6b7c..daa22d25677 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index a0cfcd15fe8..85c48e18ce1 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -16,7 +16,6 @@ #include "string_bench_args.hpp" -#include #include #include #include @@ -24,7 +23,6 @@ #include #include #include -#include class StringCombine : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 3a89b5646d7..6689e3611d1 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 3782fea1e36..488ce95d397 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index 8af111d9a63..6e3a9e8faa9 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -13,21 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#include +#include + +#include + #include #include #include #include -#include -#include -#include -#include - -#include - -#include -#include - #include #include diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 05b87906eca..88657c409cd 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -14,11 +14,9 @@ * limitations under the License. */ -#include -#include - -#include #include +#include +#include #include #include diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index 71a23c76829..3025c32b888 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -14,11 +14,9 @@ * limitations under the License. */ -#include -#include - -#include #include +#include +#include #include #include diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index a8f9eb111fc..0280322a3a1 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -20,9 +20,9 @@ #include #include +#include #include #include -#include #include #include diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index b8d206386f5..4ff29285482 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -20,9 +20,10 @@ #include #include +#include + #include #include -#include #include diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index 2e0bf4afb36..dde0b7e4424 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -16,14 +16,14 @@ #include "string_bench_args.hpp" -#include #include #include #include +#include + #include #include -#include #include #include diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index b39cf25bc91..064b824619e 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -14,17 +14,17 @@ * limitations under the License. */ -#include #include #include #include +#include + #include #include #include #include #include -#include #include #include diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 55eb52c9b30..aaa7bd29b31 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include #include #include #include +#include + #include #include #include #include -#include #include diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 9044db18522..835a437e3b5 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -16,7 +16,6 @@ #include "string_bench_args.hpp" -#include #include #include #include diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index 0a3607c64f0..10f6e2a19ed 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -14,20 +14,20 @@ * limitations under the License. 
*/ -#include +#include "string_bench_args.hpp" + #include #include #include +#include + #include #include #include -#include #include -#include "string_bench_args.hpp" - class StringReplace : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index b9d04630837..148cbe678bd 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -16,14 +16,14 @@ #include "string_bench_args.hpp" -#include #include #include #include +#include + #include #include -#include class StringReplace : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index ad25cfe54de..97eb0ba6dbf 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include #include #include #include +#include + #include #include #include -#include #include diff --git a/cpp/benchmarks/string/substring.cpp b/cpp/benchmarks/string/substring.cpp index 2195cc56515..a18462385fc 100644 --- a/cpp/benchmarks/string/substring.cpp +++ b/cpp/benchmarks/string/substring.cpp @@ -16,21 +16,20 @@ #include "string_bench_args.hpp" -#include #include #include #include +#include + #include #include #include -#include -#include - -#include #include +#include + class StringSubstring : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 38c6ff9c701..2ed0ccceba6 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -16,19 +16,19 @@ #include "string_bench_args.hpp" -#include #include #include #include -#include -#include #include -#include +#include +#include #include +#include + class StringTranslate : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index 7971d44536d..40bf2b090d4 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -14,21 +14,17 @@ * limitations under the License. */ -#include #include #include +#include + #include #include #include #include #include -#include -#include -#include -#include - #include #include #include diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 157c27ae48a..b1e70517aea 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -22,7 +21,6 @@ #include #include -#include #include diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 2cc083f4ae8..3b58a7dd187 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -14,15 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include -#include -#include #include diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp index 3bd636d4aa9..1fe912e5740 100644 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ b/cpp/benchmarks/text/normalize_spaces.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/
-#include
 #include
 #include
 #include
@@ -22,8 +21,6 @@
 #include
 #include
-#include
-#include
 #include
diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp
index 3fbb6054d5c..a093cd767b3 100644
--- a/cpp/benchmarks/text/replace.cpp
+++ b/cpp/benchmarks/text/replace.cpp
@@ -14,17 +14,18 @@
 * limitations under the License.
 */
-#include
 #include
 #include
 #include
-#include
-#include
 #include
+#include
+
 #include
+#include
+
 class TextReplace : public cudf::benchmark {
 };
diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp
index b8311324f70..d8357dcf92c 100644
--- a/cpp/benchmarks/text/subword.cpp
+++ b/cpp/benchmarks/text/subword.cpp
@@ -15,12 +15,12 @@
 */
 #include
-#include
-#include
-#include
 #include
+#include
+#include
+
 #include
 #include
 #include
diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 4cb9c9e5271..fea1973c026 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -14,16 +14,15 @@
 * limitations under the License.
 */
-#include
 #include
 #include
 #include
 #include
+#include
+
 #include
 #include
-#include
-#include
 #include
 #include
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index aba78dad3fe..53dac455b04 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -17,8 +17,6 @@
 #include
 #include
-#include
-
 #include
 #include
 #include

From 08cd4284229cce35c6824b10ac9bcebc7ccc5514 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 19 Apr 2022 11:35:01 -0400
Subject: [PATCH 17/26] Add device_memory_resource parameter to
 create_string_vector_from_column (#10673)

Adds the `rmm::mr::device_memory_resource` parameter to the
`cudf::strings::detail::create_string_vector_from_column` function. This will
be called in a future API in a later PR, and the resulting memory object will
be returned to the user.

Also found and removed a few related functions that are no longer necessary,
and updated the callers appropriately, simplifying the logic there.
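For context, a minimal caller sketch follows (not part of this patch). The
wrapper name `make_string_views` and its default argument are illustrative
assumptions; only the `create_string_vector_from_column` signature comes from
this change:

```cpp
// Hypothetical usage sketch: forward an explicit device_memory_resource so
// the caller controls which resource backs the returned vector.
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

rmm::device_uvector<cudf::string_view> make_string_views(
  cudf::strings_column_view const& input,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  // The new `mr` parameter is forwarded here; previously the vector was
  // always allocated from the current device resource.
  return cudf::strings::detail::create_string_vector_from_column(input, stream, mr);
}
```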
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10673 --- cpp/include/cudf/strings/detail/scatter.cuh | 21 ++---- cpp/include/cudf/strings/detail/utilities.cuh | 22 ------ cpp/include/cudf/strings/detail/utilities.hpp | 31 +------- cpp/src/lists/copying/scatter_helper.cu | 40 ++-------- cpp/src/strings/utilities.cu | 75 ++++--------------- 5 files changed, 33 insertions(+), 156 deletions(-) diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index b6aa22cc316..f167206f36b 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -15,14 +15,13 @@ */ #pragma once -#include -#include -#include -#include +#include #include #include +#include #include +#include #include #include @@ -71,17 +70,9 @@ std::unique_ptr scatter( // do the scatter thrust::scatter(rmm::exec_policy(stream), begin, end, scatter_map, target_vector.begin()); - // build offsets column - auto offsets_column = child_offsets_from_string_vector(target_vector, stream, mr); - // build chars column - auto chars_column = - child_chars_from_string_vector(target_vector, offsets_column->view(), stream, mr); - - return make_strings_column(target.size(), - std::move(offsets_column), - std::move(chars_column), - UNKNOWN_NULL_COUNT, - cudf::detail::copy_bitmask(target.parent(), stream, mr)); + // build the output column + auto sv_span = cudf::device_span(target_vector); + return make_strings_column(sv_span, string_view{nullptr, 0}, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index b9ea2d9ecff..bb7f29a4172 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -71,28 +71,6 @@ std::unique_ptr make_offsets_child_column( return offsets_column; } -/** - * @brief Creates an offsets column from a string_view iterator, and size. - * - * @tparam Iter Iterator type that returns string_view instances - * @param strings_begin Iterator to the beginning of the string_view sequence - * @param num_strings The number of string_view instances in the sequence - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Child offsets column - */ -template -std::unique_ptr child_offsets_from_string_iterator( - Iter strings_begin, - cudf::size_type num_strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto transformer = [] __device__(string_view v) { return v.size_bytes(); }; - auto begin = thrust::make_transform_iterator(strings_begin, transformer); - return make_offsets_child_column(begin, begin + num_strings, stream, mr); -} - /** * @brief Copies input string data into a buffer and increments the pointer by the number of bytes * copied. diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 6424841ba86..c4f9e547148 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,36 +45,11 @@ std::unique_ptr create_chars_child_column( * * @param strings Strings column instance. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vector's device memory. * @return Device vector of string_views */ rmm::device_uvector create_string_vector_from_column( - cudf::strings_column_view const strings, rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -/** - * @brief Creates an offsets column from a string_view vector. - * - * @param strings Strings input data - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Child offsets column - */ -std::unique_ptr child_offsets_from_string_vector( - cudf::device_span strings, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Creates a chars column from a string_view vector. - * - * @param strings Strings input data - * @param d_offsets Offsets vector for placing strings into column's memory. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Child chars column - */ -std::unique_ptr child_chars_from_string_vector( - cudf::device_span strings, - column_view const& offsets, + cudf::strings_column_view const strings, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index adc1b95a9e6..fecf6e1c1a1 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -21,8 +21,7 @@ #include #include #include -#include -#include +#include #include #include @@ -253,39 +252,16 @@ struct list_child_constructor { auto lists_column = actual_list_row.get_column(); auto lists_offsets_ptr = lists_column.offsets().template data(); auto child_strings_column = lists_column.child(); - auto string_offsets_ptr = - child_strings_column.child(cudf::strings_column_view::offsets_column_index) - .template data(); - auto string_chars_ptr = - child_strings_column.child(cudf::strings_column_view::chars_column_index) - .template data(); - - auto strings_offset = lists_offsets_ptr[row_index] + intra_index; - auto char_offset = string_offsets_ptr[strings_offset]; - auto char_ptr = string_chars_ptr + char_offset; - auto string_size = - string_offsets_ptr[strings_offset + 1] - string_offsets_ptr[strings_offset]; - return string_view{char_ptr, string_size}; + auto strings_offset = lists_offsets_ptr[row_index] + intra_index; + + return child_strings_column.is_null(strings_offset) + ? string_view{nullptr, 0} + : child_strings_column.template element(strings_offset); }); // string_views should now have been populated with source and target references. 
- - auto string_offsets = cudf::strings::detail::child_offsets_from_string_iterator( - string_views.begin(), string_views.size(), stream, mr); - - auto string_chars = cudf::strings::detail::child_chars_from_string_vector( - string_views, string_offsets->view(), stream, mr); - auto child_null_mask = - source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() - ? construct_child_nullmask( - list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); - - return cudf::make_strings_column(num_child_rows, - std::move(string_offsets), - std::move(string_chars), - child_null_mask.second, // Null count. - std::move(child_null_mask.first)); + auto sv_span = cudf::device_span(string_views); + return cudf::make_strings_column(sv_span, string_view{nullptr, 0}, stream, mr); } /** diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index d7cc72fdfff..a7ef2afb47f 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -20,7 +20,6 @@ #include #include -#include #include #include @@ -28,12 +27,8 @@ #include #include -#include #include -#include -#include - -#include +#include namespace cudf { namespace strings { @@ -42,65 +37,27 @@ namespace detail { /** * @copydoc create_string_vector_from_column */ -rmm::device_uvector create_string_vector_from_column(cudf::strings_column_view strings, - rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - rmm::device_uvector strings_vector(strings.size(), stream); - string_view* d_strings = strings_vector.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings.size(), - [d_column, d_strings] __device__(size_type idx) { - if (d_column.is_null(idx)) - d_strings[idx] = string_view(nullptr, 0); - else - d_strings[idx] = d_column.element(idx); - }); - return strings_vector; -} - -/** - * @copydoc child_offsets_from_string_vector - */ -std::unique_ptr child_offsets_from_string_vector( - cudf::device_span strings, +rmm::device_uvector create_string_vector_from_column( + cudf::strings_column_view const input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return child_offsets_from_string_iterator(strings.begin(), strings.size(), stream, mr); -} + auto d_strings = column_device_view::create(input.parent(), stream); -/** - * @copydoc child_chars_from_string_vector - */ -std::unique_ptr child_chars_from_string_vector(cudf::device_span strings, - column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const d_strings = strings.data(); - auto const bytes = cudf::detail::get_value(offsets, strings.size(), stream); - auto const d_offsets = offsets.data(); - - // create column - auto chars_column = create_chars_child_column(bytes, stream, mr); - // get it's view - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings.size(), - [d_strings, d_offsets, d_chars] __device__(size_type idx) { - string_view const d_str = d_strings[idx]; - memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); - }); - - return chars_column; + auto strings_vector = rmm::device_uvector(input.size(), stream, mr); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + 
strings_vector.begin(), + [d_strings = *d_strings] __device__(size_type idx) { + return d_strings.is_null(idx) ? string_view{nullptr, 0} : d_strings.element(idx); + }); + + return strings_vector; } -// std::unique_ptr create_chars_child_column(cudf::size_type total_bytes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) From 565f4743f797ff31afd402f715a4c57547cb6c66 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 19 Apr 2022 12:17:05 -0400 Subject: [PATCH 18/26] Split up mixed-join kernels source files (#10671) Split up `mixed_join_kernels.cu` and `mixed_join_size_kernels.cu` to improve overall build time. Currently these take about 30 minutes each on the gpuCI build. Example of a recent build metrics report: https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci-22-06/job/cudf/job/prb/job/cudf-cpu-cuda-build/CUDA=11.5/164/Build_20Metrics_20Report/ The nulls and non-nulls definitions are placed into separate source files. The kernel source used for `mixed_join_kernels.cu` (both null and non-null) is moved to `mixed_join_kernel.cuh` and the nulls definition is moved to `mixed_join_kernel_nulls.cu`. For consistency the `mixed_join_kernels.cu` name is changed to just `mixed_join_kernel.cu` since it now only contains one definition. This same pattern applies to `mixed_join_size_kernels.cu` splitting into `mixed_join_size_kernel.cuh`, `mixed_join_size_kernel_nulls.cu` and `mixed_join_size_kernel.cu` No function behavior or actual code generation has changed. The source code has just moved into more source files to help better parallelize and speed up the build process. This improves compile time by 10% for a release build and ~25% for a debug build. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10671 --- cpp/CMakeLists.txt | 6 ++- cpp/src/join/mixed_join_kernel.cu | 38 +++++++++++++++++++ ..._join_kernels.cu => mixed_join_kernel.cuh} | 31 ++------------- cpp/src/join/mixed_join_kernel_nulls.cu | 38 +++++++++++++++++++ cpp/src/join/mixed_join_size_kernel.cu | 36 ++++++++++++++++++ ..._kernels.cu => mixed_join_size_kernel.cuh} | 27 ------------- cpp/src/join/mixed_join_size_kernel_nulls.cu | 36 ++++++++++++++++++ 7 files changed, 155 insertions(+), 57 deletions(-) create mode 100644 cpp/src/join/mixed_join_kernel.cu rename cpp/src/join/{mixed_join_kernels.cu => mixed_join_kernel.cuh} (82%) create mode 100644 cpp/src/join/mixed_join_kernel_nulls.cu create mode 100644 cpp/src/join/mixed_join_size_kernel.cu rename cpp/src/join/{mixed_join_size_kernels.cu => mixed_join_size_kernel.cuh} (80%) create mode 100644 cpp/src/join/mixed_join_size_kernel_nulls.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9422edaa8f..dbc55827a32 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -344,10 +344,12 @@ add_library( src/join/join.cu src/join/join_utils.cu src/join/mixed_join.cu - src/join/mixed_join_kernels.cu + src/join/mixed_join_kernel.cu + src/join/mixed_join_kernel_nulls.cu src/join/mixed_join_kernels_semi.cu src/join/mixed_join_semi.cu - src/join/mixed_join_size_kernels.cu + src/join/mixed_join_size_kernel.cu + src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/lists/contains.cu diff --git 
a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu new file mode 100644 index 00000000000..f8912f0c7bd --- /dev/null +++ b/cpp/src/join/mixed_join_kernel.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mixed_join_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void mixed_join( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels.cu b/cpp/src/join/mixed_join_kernel.cuh similarity index 82% rename from cpp/src/join/mixed_join_kernels.cu rename to cpp/src/join/mixed_join_kernel.cuh index efaea841e45..f7081cc4d63 100644 --- a/cpp/src/join/mixed_join_kernels.cu +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -32,6 +34,7 @@ namespace cudf { namespace detail { + namespace cg = cooperative_groups; template @@ -107,34 +110,6 @@ __launch_bounds__(block_size) __global__ } } -template __global__ void mixed_join( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - -template __global__ void mixed_join( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu new file mode 100644 index 00000000000..a911c62b349 --- /dev/null +++ b/cpp/src/join/mixed_join_kernel_nulls.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mixed_join_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void mixed_join( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu new file mode 100644 index 00000000000..cf8236e2be2 --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernel.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mixed_join_size_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernels.cu b/cpp/src/join/mixed_join_size_kernel.cuh similarity index 80% rename from cpp/src/join/mixed_join_size_kernels.cu rename to cpp/src/join/mixed_join_size_kernel.cuh index 22c71bfc33a..9eedc1a8015 100644 --- a/cpp/src/join/mixed_join_size_kernels.cu +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -99,32 +99,5 @@ __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( if (threadIdx.x == 0) atomicAdd(output_size, block_counter); } -template __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -template __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu new file mode 100644 index 00000000000..f05d674b3b5 --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
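As background on the build-time technique this patch uses: the kernel template definition lives in a header, and each translation unit explicitly instantiates exactly one specialization, letting nvcc compile the variants in parallel. A minimal sketch of the pattern, with hypothetical file and kernel names (not the PR's actual code):

```
// example_kernel.cuh - template definition shared by all translation units
#pragma once

template <bool has_nulls>
__global__ void example_kernel(int const* in, int* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // The template parameter is a compile-time constant, so each
  // instantiation is a distinct kernel compiled separately.
  if (i < n) { out[i] = has_nulls ? in[i] + 1 : in[i]; }
}

// example_kernel.cu - instantiates only the non-nulls variant
#include "example_kernel.cuh"
template __global__ void example_kernel<false>(int const*, int*, int);

// example_kernel_nulls.cu - instantiates only the nulls variant; listing
// both .cu files in CMake lets the two instantiations build in parallel
#include "example_kernel.cuh"
template __global__ void example_kernel<true>(int const*, int*, int);
```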
+ */ + +#include "mixed_join_size_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +} // namespace detail +} // namespace cudf From 304711a98c5786901b6c939d3fa8ae0f174840dd Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 19 Apr 2022 13:37:28 -0400 Subject: [PATCH 19/26] Handle RuntimeError thrown by CUDA Python in `validate_setup` (#10653) The call to `getDeviceCount()` can raise a `RuntimeError` when `libcuda.so` is missing. We should handle that too in `validate_setup()`. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10653 --- python/cudf/cudf/utils/gpu_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index a722d350ef4..ab3adc1651a 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -55,6 +55,12 @@ def validate_setup(): raise e # If there is no GPU detected, set `gpus_count` to -1 gpus_count = -1 + except RuntimeError as e: + # getDeviceCount() can raise a RuntimeError + # when ``libcuda.so`` is missing. + # We don't want this to propagate up to the user. + warnings.warn(str(e)) + return if gpus_count > 0: # Cupy throws RunTimeException to get GPU count, From 31a5f44a23135a46beee019fa21f54a695d719f9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 19 Apr 2022 10:47:22 -0700 Subject: [PATCH 20/26] Cython API Refactor: `transpose.pyx`, `sort.pyx` (#10675) This PR contributes to #10153 by refactoring all Cython APIs in `transpose.pyx` and `sort.pyx` to accept a list of columns as input. It also includes several minor improvements in the code base; see the comments below for details. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10675 --- python/cudf/cudf/_lib/sort.pyx | 101 ++++++++----------- python/cudf/cudf/_lib/transpose.pyx | 60 ++---------- python/cudf/cudf/_lib/utils.pyx | 8 +- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/dataframe.py | 37 +++++-- python/cudf/cudf/core/frame.py | 113 +++++----------------- python/cudf/cudf/core/indexed_frame.py | 87 +++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 29 ------ python/cudf/cudf/tests/test_series.py | 29 ++++++ 9 files changed, 222 insertions(+), 246 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3aa0b35e90e..faa4279c1ca 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -1,6 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import pandas as pd +# Copyright (c) 2020-2022, NVIDIA CORPORATION.
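The `validate_setup` fix above follows a simple defensive pattern that can be sketched standalone. In this sketch, `get_device_count` stands in for the CUDA Python call cudf uses, and the fallback value is an assumption chosen for illustration:

```
import warnings


def safe_gpu_count(get_device_count):
    """Return a GPU count, downgrading a missing libcuda.so to a warning."""
    try:
        return get_device_count()
    except RuntimeError as e:
        # CUDA Python raises RuntimeError when ``libcuda.so`` cannot be
        # loaded; warn and report no GPUs instead of crashing the import.
        warnings.warn(str(e))
        return 0
```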
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -23,19 +21,19 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def is_sorted( - source_table, object ascending=None, object null_position=None + list source_columns, object ascending=None, object null_position=None ): """ Checks whether the rows of a `table` are sorted in lexicographical order. Parameters ---------- - source_table : Frame - Frame whose columns are to be checked for sort order + source_columns : list of columns + columns to be checked for sort order ascending : None or list-like of booleans None or list-like of boolean values indicating expected sort order of each column. If list-like, size of list-like must be len(columns). If @@ -58,51 +56,39 @@ def is_sorted( cdef vector[null_order] null_precedence if ascending is None: - column_order = vector[order]( - source_table._num_columns, order.ASCENDING - ) - elif pd.api.types.is_list_like(ascending): - if len(ascending) != source_table._num_columns: + column_order = vector[order](len(source_columns), order.ASCENDING) + else: + if len(ascending) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(ascending)} for `ascending`" ) column_order = vector[order]( - source_table._num_columns, order.DESCENDING + len(source_columns), order.DESCENDING ) for idx, val in enumerate(ascending): if val: column_order[idx] = order.ASCENDING - else: - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) if null_position is None: null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) - elif pd.api.types.is_list_like(null_position): - if len(null_position) != source_table._num_columns: + else: + if len(null_position) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(null_position)} for `null_position`" ) null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) for idx, val in enumerate(null_position): if val: null_precedence[idx] = null_order.BEFORE - else: - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) cdef bool c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: c_result = cpp_is_sorted( source_table_view, @@ -113,34 +99,34 @@ def is_sorted( return c_result -def order_by(source_table, object ascending, str na_position): +def order_by(list columns_from_table, object ascending, str na_position): """ - Sorting the table ascending/descending + Get index to sort the table in ascending/descending order. 
Parameters ---------- - source_table : table which will be sorted - ascending : list of boolean values which correspond to each column + columns_from_table : columns from the table which will be sorted + ascending : sequence of boolean values which correspond to each column in source_table signifying order of each column True - Ascending and False - Descending na_position : whether null value should show up at the "first" or "last" position of **all** sorted column. """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True + cdef table_view source_table_view = table_view_from_columns( + columns_from_table ) cdef vector[order] column_order column_order.reserve(len(ascending)) cdef vector[null_order] null_precedence null_precedence.reserve(len(ascending)) - for i in ascending: - if i is True: + for asc in ascending: + if asc: column_order.push_back(order.ASCENDING) else: column_order.push_back(order.DESCENDING) - if i ^ (na_position == "first"): + if asc ^ (na_position == "first"): null_precedence.push_back(null_order.AFTER) else: null_precedence.push_back(null_order.BEFORE) @@ -154,21 +140,21 @@ def order_by(source_table, object ascending, str na_position): return Column.from_unique_ptr(move(c_result)) -def digitize(source_values_table, bins, bool right=False): +def digitize(list source_columns, list bins, bool right=False): """ Return the indices of the bins to which each value in source_table belongs. Parameters ---------- - source_table : Input table to be binned. - bins : Frame containing columns of bins + source_columns : Input columns to be binned. + bins : List containing columns of bins right : Indicating whether the intervals include the right or the left bin edge. """ - cdef table_view bins_view = table_view_from_table(bins) - cdef table_view source_values_table_view = table_view_from_table( - source_values_table + cdef table_view bins_view = table_view_from_columns(bins) + cdef table_view source_table_view = table_view_from_columns( + source_columns ) cdef vector[order] column_order = ( vector[order]( @@ -184,11 +170,11 @@ def digitize(source_values_table, bins, bool right=False): ) cdef unique_ptr[column] c_result - if right is True: + if right: with nogil: c_result = move(lower_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -196,7 +182,7 @@ def digitize(source_values_table, bins, bool right=False): with nogil: c_result = move(upper_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -212,15 +198,13 @@ class RankMethod(IntEnum): DENSE = < underlying_type_t_rank_method > rank_method.DENSE -def rank_columns(source_table, object method, str na_option, +def rank_columns(list source_columns, object method, str na_option, bool ascending, bool pct ): """ Compute numerical data ranks (1 through n) of each column in the dataframe """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) cdef rank_method c_rank_method = < rank_method > ( < underlying_type_t_rank_method > method @@ -260,7 +244,7 @@ def rank_columns(source_table, object method, str na_option, cdef vector[unique_ptr[column]] c_results cdef column_view c_view cdef Column col - for col in source_table._columns: + for col in source_columns: c_view = col.view() with nogil: c_results.push_back(move( @@ -274,11 +258,6 @@ def rank_columns(source_table, object 
method, str na_option, ) )) - cdef unique_ptr[table] c_result - c_result.reset(new table(move(c_results))) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=None - ) - return data, source_table._index + return [Column.from_unique_ptr( + move(c_results[i]) + ) for i in range(c_results.size())] diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 931a2702612..b9eea6169bd 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,7 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import cudf -from cudf.api.types import is_categorical_dtype +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -9,65 +6,22 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose -from cudf._lib.utils cimport data_from_table_view, table_view_from_table - +from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns -def transpose(source): - """Transpose index and columns. - See Also - -------- - cudf.core.DataFrame.transpose +def transpose(list source_columns): + """Transpose m n-row columns into n m-row columns """ - - if source._num_columns == 0: - return source - - cats = None - columns = source._columns - dtype = columns[0].dtype - - if is_categorical_dtype(dtype): - if any(not is_categorical_dtype(c.dtype) for c in columns): - raise ValueError('Columns must all have the same dtype') - cats = list(c.categories for c in columns) - cats = cudf.core.column.concat_columns(cats).unique() - source = cudf.core.frame.Frame(index=source._index, data=[ - (name, col._set_categories(cats, is_unique=True).codes) - for name, col in source._data.items() - ]) - elif any(c.dtype != dtype for c in columns): - raise ValueError('Columns must all have the same dtype') - cdef pair[unique_ptr[column], table_view] c_result - cdef table_view c_input = table_view_from_table( - source, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - data, _ = data_from_table_view( + return columns_from_table_view( c_result.second, - owner=result_owner, - column_names=range(c_input.num_rows()) + owners=[result_owner] * c_result.second.num_columns() ) - - if cats is not None: - data= [ - (name, cudf.core.column.column.build_categorical_column( - codes=cudf.core.column.column.build_column( - col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - categories=cats, - offset=col.offset, - )) - for name, col in data.items() - ] - - return data diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8557f430e25..643a1adca9f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -317,10 +317,10 @@ cdef columns_from_table_view( ): """ Given a ``cudf::table_view``, construsts a list of columns from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. ``owner`` must be either None or a list of column. 
If ``owner`` - is a list of columns, the owner of the `i`th ``cudf::column_view`` in the - table view is ``owners[i]``. For more about memory ownership, + along with referencing an owner Python object that owns the memory + lifetime. owner must be either None or a list of column. If owner + is a list of columns, the owner of the `i`th ``cudf::column_view`` + in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 216faaa8250..e7b8d62f886 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -774,6 +774,4 @@ def digitize( if bin_col.nullable: raise ValueError("`bins` cannot contain null entries.") - return as_column( - libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right) - ) + return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 50255b07077..d87cb788a7e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3194,17 +3194,42 @@ def transpose(self): Difference from pandas: Not supporting *copy* because default and only behavior is copy=True """ - # Never transpose a MultiIndex - remove the existing columns and - # replace with a RangeIndex. Afterward, reassign. - columns = self.index.copy(deep=False) + index = self._data.to_pandas_index() + columns = self.index.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) + + # No column from index is transposed with libcudf. + source_columns = [*self._columns] + source_dtype = source_columns[0].dtype + if is_categorical_dtype(source_dtype): + if any(not is_categorical_dtype(c.dtype) for c in source_columns): + raise ValueError("Columns must all have the same dtype") + cats = list(c.categories for c in source_columns) + cats = cudf.core.column.concat_columns(cats).unique() + source_columns = [ + col._set_categories(cats, is_unique=True).codes + for col in source_columns + ] + + if any(c.dtype != source_columns[0].dtype for c in source_columns): + raise ValueError("Columns must all have the same dtype") + + result_columns = libcudf.transpose.transpose(source_columns) + + if is_categorical_dtype(source_dtype): + result_columns = [ + codes._with_type_metadata( + cudf.core.dtypes.CategoricalDtype(categories=cats) + ) + for codes in result_columns + ] + # Set the old column names as the new index result = self.__class__._from_data( - # Cython renames the columns to the range [0...ncols] - libcudf.transpose.transpose(self), - as_index(index), + {i: col for i, col in enumerate(result_columns)}, + index=as_index(index), ) # Set the old index as the new column names result.columns = columns diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d10f7c690bf..e5863b52a5d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1289,89 +1289,6 @@ def _quantiles( column_names=self._column_names, ) - @_cudf_nvtx_annotate - def rank( - self, - axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): - """ - Compute numerical data ranks (1 through n) along axis. - By default, equal values are assigned a rank that is the average of the - ranks of those values. 
- - Parameters - ---------- - axis : {0 or 'index'}, default 0 - Index to direct ranking. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value - (i.e. ties): - * average: average rank of the group - * min: lowest rank in the group - * max: highest rank in the group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional - For DataFrame objects, rank only numeric columns if set to True. - na_option : {'keep', 'top', 'bottom'}, default 'keep' - How to rank NaN values: - * keep: assign NaN rank to NaN values - * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending. - ascending : bool, default True - Whether or not the elements should be ranked in ascending order. - pct : bool, default False - Whether or not to display the returned rankings in percentile - form. - - Returns - ------- - same type as caller - Return a Series or DataFrame with data ranks as values. - """ - if isinstance(self, cudf.BaseIndex): - warnings.warn( - "Index.rank is deprecated and will be removed.", - FutureWarning, - ) - - if method not in {"average", "min", "max", "first", "dense"}: - raise KeyError(method) - - method_enum = libcudf.sort.RankMethod[method.upper()] - if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - "na_option must be one of 'keep', 'top', or 'bottom'" - ) - - if axis not in (0, "index"): - raise NotImplementedError( - f"axis must be `0`/`index`, " - f"axis={axis} is not yet supported in rank" - ) - - source = self - if numeric_only: - numeric_cols = ( - name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return source.astype("float64") - - data, index = libcudf.sort.rank_columns( - source, method_enum, na_option, ascending, pct - ) - - return self._from_data(data, index).astype(np.float64) - @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" @@ -2219,15 +2136,17 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): # Get an int64 column consisting of the indices required to sort self # according to the columns specified in by. - to_sort = ( - self - if by is None - else self._get_columns_by_label(list(by), downcast=False) - ) + to_sort = [ + *( + self + if by is None + else self._get_columns_by_label(list(by), downcast=False) + )._columns + ] # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending = [ascending] * to_sort._num_columns + ascending = [ascending] * len(to_sort) return libcudf.sort.order_by(to_sort, ascending, na_position) @@ -2387,8 +2306,22 @@ def _is_sorted(self, ascending=None, null_position=None): Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. 
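To make the refactor's new calling convention concrete: the Cython layer now receives a bare list of columns, so Python-level callers unpack `Frame._columns` themselves. A sketch of the call shape (these are cudf-internal APIs, shown only to illustrate the new signature):

```
import cudf
from cudf._lib import sort as libcudf_sort

df = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})

# Pass the columns themselves rather than the Frame; "a" ascending and
# "b" descending are both satisfied here, so this prints True.
print(libcudf_sort.is_sorted([*df._columns], [True, False], None))
```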
""" + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) return libcudf.sort.is_sorted( - self, ascending=ascending, null_position=null_position + [*self._columns], ascending=ascending, null_position=null_position ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ddb3082af96..fedbaed28db 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3577,6 +3577,93 @@ def ge( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) + @_cudf_nvtx_annotate + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): + """ + Compute numerical data ranks (1 through n) along axis. + + By default, equal values are assigned a rank that is the average of the + ranks of those values. + + Parameters + ---------- + axis : {0 or 'index'}, default 0 + Index to direct ranking. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + How to rank the group of records that have the same value + (i.e. ties): + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups. + numeric_only : bool, optional + For DataFrame objects, rank only numeric columns if set to True. + na_option : {'keep', 'top', 'bottom'}, default 'keep' + How to rank NaN values: + * keep: assign NaN rank to NaN values + * top: assign smallest rank to NaN values if ascending + * bottom: assign highest rank to NaN values if ascending. + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + + Returns + ------- + same type as caller + Return a Series or DataFrame with data ranks as values. 
+ """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.rank is deprecated and will be removed.", + FutureWarning, + ) + + if method not in {"average", "min", "max", "first", "dense"}: + raise KeyError(method) + + method_enum = libcudf.sort.RankMethod[method.upper()] + if na_option not in {"keep", "top", "bottom"}: + raise ValueError( + "na_option must be one of 'keep', 'top', or 'bottom'" + ) + + if axis not in (0, "index"): + raise NotImplementedError( + f"axis must be `0`/`index`, " + f"axis={axis} is not yet supported in rank" + ) + + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if _is_non_decimal_numeric_dtype(self._data[name]) + ) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return source.astype("float64") + + result_columns = libcudf.sort.rank_columns( + [*source._columns], method_enum, na_option, ascending, pct + ) + + return self.__class__._from_data( + dict(zip(source._column_names, result_columns)), + index=source._index, + ).astype(np.float64) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2685524add4..957277d7f9b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2467,35 +2467,6 @@ def test_arrow_handle_no_index_name(pdf, gdf): assert_eq(expect, got) -@pytest.mark.parametrize("num_rows", [1, 3, 10, 100]) -@pytest.mark.parametrize("num_bins", [1, 2, 4, 20]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) -@pytest.mark.parametrize("series_bins", [True, False]) -def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = cudf.Series(data) - if series_bins: - s_bins = cudf.Series(bins) - indices = s.digitize(s_bins, right) - else: - indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) - - -def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." 
- ): - _ = s.digitize(bins) - - def test_pandas_non_contiguious(): arr1 = np.random.sample([5000, 10]) assert arr1.flags["C_CONTIGUOUS"] is True diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index fccb9f680d9..87fb9bff7ed 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1782,3 +1782,32 @@ def test_diff_many_dtypes(data): gs = cudf.from_pandas(ps) assert_eq(ps.diff(), gs.diff()) assert_eq(ps.diff(periods=2), gs.diff(periods=2)) + + +@pytest.mark.parametrize("num_rows", [1, 100]) +@pytest.mark.parametrize("num_bins", [1, 10]) +@pytest.mark.parametrize("right", [True, False]) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("series_bins", [True, False]) +def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): + data = np.random.randint(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + s = cudf.Series(data) + if series_bins: + s_bins = cudf.Series(bins) + indices = s.digitize(s_bins, right) + else: + indices = s.digitize(bins, right) + np.testing.assert_array_equal( + np.digitize(data, bins, right), indices.to_numpy() + ) + + +def test_series_digitize_invalid_bins(): + s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + bins = cudf.Series([2, None, None, 50, 90], dtype="int32") + + with pytest.raises( + ValueError, match="`bins` cannot contain null entries." + ): + _ = s.digitize(bins) From 65b1cbdeda9cab57243d0a98e646c860ef86039e Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 20 Apr 2022 03:24:43 +0530 Subject: [PATCH 21/26] add data generation to benchmark documentation (#10677) add device data generation to benchmark documentation Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Conor Hoekstra (https://github.com/codereport) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10677 --- cpp/docs/BENCHMARKING.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/docs/BENCHMARKING.md b/cpp/docs/BENCHMARKING.md index 8794c90d1db..270e7a87e85 100644 --- a/cpp/docs/BENCHMARKING.md +++ b/cpp/docs/BENCHMARKING.md @@ -35,6 +35,12 @@ provided in `cpp/benchmarks/synchronization/synchronization.hpp` to help with th can also optionally clear the GPU L2 cache in order to ensure cache hits do not artificially inflate performance in repeated iterations. +## Data generation + +For generating benchmark input data, helper functions are available at [cpp/benchmarks/common/generate_input.hpp](/cpp/benchmarks/common/generate_input.hpp). The input data generation happens on device, in contrast to any `column_wrapper` where data generation happens on the host. +* `create_sequence_table` can generate sequence columns starting with value 0 in first row and increasing by 1 in subsequent rows. +* `create_random_table` can generate a table filled with random data. The random data parameters are configurable. + ## What should we benchmark? 
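As a concrete illustration of the data-generation helpers described in the section above, here is a hedged sketch of how a benchmark might call them. The call shapes and the `row_count` parameter are assumptions for illustration; consult `generate_input.hpp` for the real signatures:

```
#include <benchmarks/common/generate_input.hpp>

#include <cudf/types.hpp>

void bench_setup()
{
  // Hypothetical call shapes -- verify against generate_input.hpp before use.
  // Sequence columns: value 0 in the first row, increasing by 1 per row.
  auto seq = create_sequence_table({cudf::type_id::INT32, cudf::type_id::INT64},
                                   row_count{1'000'000});

  // Random data, generated on device, with parameters configurable
  // through a data profile (defaulted here).
  auto rnd = create_random_table({cudf::type_id::FLOAT64}, row_count{1'000'000});
}
```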
In general, we should benchmark all features over a range of data sizes and types, so that we can From 017d52a3c7aae758dbaca5495997db8482933b46 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 20 Apr 2022 11:07:29 -0400 Subject: [PATCH 22/26] Improve parquet dictionary encoding (#10635) This PR includes several changes to improve parquet dictionary encoding: - API cleanups: get rid of unused arguments - Remove min block limit in `__launch_bounds__` - Simplify the grid-stride loop logic by using `while` - All threads calculate start/end indices instead of one doing the calculation and broadcasting the result (no more shared memory or block-wide sync). Other ideas were tested but ultimately not included in this PR due to zero or negative performance impact: - Tuning hash map occupancy - `cg::shfl` instead of shared memory + sync - CG based `insert`/`find` - Relaxed atomic for `num_dict_entries` and `uniq_data_size` - `cg::reduce` instead of `cub::BlockReduce` Before: ``` ----------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters...
----------------------------------------------------------------------------------------------------------------------- ParquetWrite/integral_void_output/29/0/1/1/2/manual_time 733 ms 733 ms 1 bytes_per_second=698.24M/s encoded_file_size=530.706M peak_memory_usage=1.7804G ParquetWrite/integral_void_output/29/1000/1/1/2/manual_time 302 ms 302 ms 2 bytes_per_second=1.65496G/s encoded_file_size=397.998M peak_memory_usage=1.49675G ParquetWrite/integral_void_output/29/0/32/1/2/manual_time 733 ms 733 ms 1 bytes_per_second=698.701M/s encoded_file_size=530.706M peak_memory_usage=1.7804G ParquetWrite/integral_void_output/29/1000/32/1/2/manual_time 61.3 ms 61.3 ms 11 bytes_per_second=8.1533G/s encoded_file_size=159.572M peak_memory_usage=1.49675G ParquetWrite/integral_void_output/29/0/1/0/2/manual_time 688 ms 688 ms 1 bytes_per_second=743.71M/s encoded_file_size=531.066M peak_memory_usage=1.3148G ParquetWrite/integral_void_output/29/1000/1/0/2/manual_time 282 ms 282 ms 2 bytes_per_second=1.7712G/s encoded_file_size=398.712M peak_memory_usage=1.49675G ParquetWrite/integral_void_output/29/0/32/0/2/manual_time 688 ms 688 ms 1 bytes_per_second=743.658M/s encoded_file_size=531.066M peak_memory_usage=1.3148G ParquetWrite/integral_void_output/29/1000/32/0/2/manual_time 58.9 ms 58.9 ms 12 bytes_per_second=8.49093G/s encoded_file_size=199.926M peak_memory_usage=1.49675G ``` Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/10635 --- cpp/src/io/parquet/chunk_dict.cu | 191 +++++++++++++---------------- cpp/src/io/parquet/parquet_gpu.hpp | 8 +- cpp/src/io/parquet/writer_impl.cu | 4 +- 3 files changed, 88 insertions(+), 115 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index f61cfa83579..d3ac491f416 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -22,19 +22,24 @@ #include +#include + namespace cudf { namespace io { namespace parquet { namespace gpu { +namespace { +constexpr int DEFAULT_BLOCK_SIZE = 256; +} template -__global__ void __launch_bounds__(block_size, 1) +__global__ void __launch_bounds__(block_size) initialize_chunk_hash_maps_kernel(device_span chunks) { auto chunk = chunks[blockIdx.x]; auto t = threadIdx.x; // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk - for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; @@ -91,9 +96,8 @@ struct map_find_fn { }; template -__global__ void __launch_bounds__(block_size, 1) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags) +__global__ void __launch_bounds__(block_size) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -102,70 +106,57 @@ __global__ void __launch_bounds__(block_size, 1) auto chunk = frag.chunk; auto col = chunk->col_desc; - size_type start_row = frag.start_row; - size_type end_row = frag.start_row + frag.num_rows; + if (not chunk->use_dictionary) { return; } - __shared__ size_type s_start_value_idx; - __shared__ size_type 
s_num_values; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; - if (not chunk->use_dictionary) { return; } + size_type start_row = frag.start_row; + size_type end_row = frag.start_row + frag.num_rows; - if (t == 0) { - // Find the bounds of values in leaf column to be inserted into the map for current chunk - auto cudf_col = *(col->parent_column); - s_start_value_idx = row_to_value_idx(start_row, cudf_col); - auto end_value_idx = row_to_value_idx(end_row, cudf_col); - s_num_values = end_value_idx - s_start_value_idx; - } - __syncthreads(); + // Find the bounds of values in leaf column to be inserted into the map for current chunk + auto const cudf_col = *(col->parent_column); + size_type const s_start_value_idx = row_to_value_idx(start_row, cudf_col); + size_type const end_value_idx = row_to_value_idx(end_row, cudf_col); column_device_view const& data_col = *col->leaf_column; - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage reduce_storage; // Make a view of the hash map auto hash_map_mutable = map_type::device_mutable_view( chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - auto hash_map = map_type::device_view( - chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - __shared__ int total_num_dict_entries; - for (size_type i = 0; i < s_num_values; i += block_size) { - // add the value to hash map - size_type val_idx = i + t + s_start_value_idx; - bool is_valid = - (i + t < s_num_values && val_idx < data_col.size()) and data_col.is_valid(val_idx); + __shared__ size_type total_num_dict_entries; + size_type val_idx = s_start_value_idx + t; + while (val_idx - block_size < end_value_idx) { + auto const is_valid = + val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); // insert element at val_idx to hash map and count successful insertions size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { - auto found_slot = type_dispatcher(data_col.type(), map_find_fn{hash_map}, data_col, val_idx); - if (found_slot == hash_map.end()) { - is_unique = - type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: - if (data_col.type().id() == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); - } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); - } - }(); - } + is_unique = + type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: + if (data_col.type().id() == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return 
sizeof(__int128_t); } + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + } + }(); } - __syncthreads(); auto num_unique = block_reduce(reduce_storage).Sum(is_unique); __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); @@ -178,11 +169,13 @@ __global__ void __launch_bounds__(block_size, 1) // Check if the num unique values in chunk has already exceeded max dict size and early exit if (total_num_dict_entries > MAX_DICT_SIZE) { return; } - } + + val_idx += block_size; + } // while } template -__global__ void __launch_bounds__(block_size, 1) +__global__ void __launch_bounds__(block_size) collect_map_entries_kernel(device_span chunks) { auto& chunk = chunks[blockIdx.x]; @@ -192,31 +185,30 @@ __global__ void __launch_bounds__(block_size, 1) auto map = map_type::device_view(chunk.dict_map_slots, chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - __shared__ size_type counter; - if (t == 0) counter = 0; + __shared__ cuda::atomic counter; + using cuda::std::memory_order_relaxed; + if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { - auto slot = map.begin_slot() + t + i; - auto key = static_cast(slot->first); + auto* slot = reinterpret_cast(map.begin_slot() + t + i); + auto key = slot->first; if (key != KEY_SENTINEL) { - auto loc = atomicAdd(&counter, 1); + auto loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); chunk.dict_data[loc] = key; // If sorting dict page ever becomes a hard requirement, enable the following statement and // add a dict sorting step before storing into the slot's second field. // chunk.dict_data_idx[loc] = t + i; - slot->second.store(loc); - // TODO: ^ This doesn't need to be atomic. Try casting to value_type ptr and just writing. 
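The counter rewrite above replaces a raw `atomicAdd` with libcu++'s `cuda::atomic` at block scope and relaxed ordering, which states the required semantics (atomicity only, no ordering) explicitly. A self-contained sketch of the idiom, assuming a single-block launch (not the PR's kernel):

```
#include <cuda/atomic>

#include <new>

__global__ void count_positive(int const* data, int n, int* out)
{
  // Placement-new initializes the shared atomic, mirroring the PR's pattern.
  __shared__ cuda::atomic<int, cuda::thread_scope_block> counter;
  if (threadIdx.x == 0) {
    new (&counter) cuda::atomic<int, cuda::thread_scope_block>{0};
  }
  __syncthreads();

  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    // Relaxed ordering: only the increment itself must be atomic.
    if (data[i] > 0) { counter.fetch_add(1, cuda::std::memory_order_relaxed); }
  }
  __syncthreads();

  if (threadIdx.x == 0) { *out = counter.load(cuda::std::memory_order_relaxed); }
}
```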
+ slot->second = loc; } } } } template -__global__ void __launch_bounds__(block_size, 1) - get_dictionary_indices_kernel(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags) +__global__ void __launch_bounds__(block_size) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -225,47 +217,38 @@ __global__ void __launch_bounds__(block_size, 1) auto chunk = frag.chunk; auto col = chunk->col_desc; + if (not chunk->use_dictionary) { return; } + size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; - __shared__ size_type s_start_value_idx; - __shared__ size_type s_ck_start_val_idx; - __shared__ size_type s_num_values; - - if (t == 0) { - // Find the bounds of values in leaf column to be searched in the map for current chunk - auto cudf_col = *(col->parent_column); - s_start_value_idx = row_to_value_idx(start_row, cudf_col); - s_ck_start_val_idx = row_to_value_idx(chunk->start_row, cudf_col); - auto end_value_idx = row_to_value_idx(end_row, cudf_col); - s_num_values = end_value_idx - s_start_value_idx; - } - __syncthreads(); - - if (not chunk->use_dictionary) { return; } + // Find the bounds of values in leaf column to be searched in the map for current chunk + auto const cudf_col = *(col->parent_column); + auto const s_start_value_idx = row_to_value_idx(start_row, cudf_col); + auto const s_ck_start_val_idx = row_to_value_idx(chunk->start_row, cudf_col); + auto const end_value_idx = row_to_value_idx(end_row, cudf_col); column_device_view const& data_col = *col->leaf_column; auto map = map_type::device_view( chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - for (size_t i = 0; i < s_num_values; i += block_size) { - if (t + i < s_num_values) { - auto val_idx = s_start_value_idx + t + i; - bool is_valid = - (i + t < s_num_values && val_idx < data_col.size()) ? 
data_col.is_valid(val_idx) : false; - - if (is_valid) { - auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); - cudf_assert(found_slot != map.end() && - "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto* val_ptr = reinterpret_cast(&found_slot->second); - chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; - } + auto val_idx = s_start_value_idx + t; + while (val_idx < end_value_idx) { + auto const is_valid = val_idx < data_col.size() and data_col.is_valid(val_idx); + + if (is_valid) { + auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); + cudf_assert(found_slot != map.end() && + "Unable to find value in map in dictionary index construction"); + if (found_slot != map.end()) { + // No need for atomic as this is not going to be modified by any other thread + auto* val_ptr = reinterpret_cast(&found_slot->second); + chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; } } + + val_idx += block_size; } } @@ -276,15 +259,12 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { - constexpr int block_size = 256; dim3 const dim_grid(frags.size().second, frags.size().first); - - populate_chunk_hash_maps_kernel - <<>>(chunks, frags); + populate_chunk_hash_maps_kernel + <<>>(frags); } void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) @@ -293,15 +273,12 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { - constexpr int block_size = 256; dim3 const dim_grid(frags.size().second, frags.size().first); - - get_dictionary_indices_kernel - <<>>(chunks, frags); + get_dictionary_indices_kernel + <<>>(frags); } } // namespace gpu } // namespace parquet diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8d0aa8881c3..53b82c73a35 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -529,12 +529,10 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st /** * @brief Insert chunk values into their respective hash maps * - * @param chunks Column chunks [rowgroup][column] * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -554,12 +552,10 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * Since dict_data itself contains indices into the original cudf column, this means that * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * - * @param chunks Column chunks [rowgroup][column] * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** diff --git 
a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 4bc084c61d0..92d436e4566 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -895,7 +895,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device(stream); gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(chunks, frags, stream); + gpu::populate_chunk_hash_maps(frags, stream); chunks.device_to_host(stream, true); @@ -944,7 +944,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, } chunks.host_to_device(stream); gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(chunks.device_view(), frags, stream); + gpu::get_dictionary_indices(frags, stream); return std::make_pair(std::move(dict_data), std::move(dict_index)); } From 5f6b70a1dfe000c0ac16536c507b0a7bbe6a9efc Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 20 Apr 2022 11:41:25 -0500 Subject: [PATCH 23/26] Fix sphinx/jupyter heading issue in UDF notebook (#10690) Fixes an issue where sphinx was reading the `#`'s in the UDF guide markdown cells as section headings, causing strange effects in the docs' main index page. Authors: - https://github.com/brandon-b-miller Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10690 --- .../source/user_guide/guide-to-udfs.ipynb | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 0d05ddb00b4..8026c378156 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -4,7 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Overview of User Defined Functions with cuDF" + "Overview of User Defined Functions with cuDF\n", + "====================================" ] }, { @@ -40,7 +41,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Series UDFs\n", + "Series UDFs\n", + "--------------\n", "\n", "You can execute UDFs on Series in two ways:\n", "\n", @@ -54,7 +56,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# `cudf.Series.apply`" + "`cudf.Series.apply`\n", + "---------------------" ] }, { @@ -126,7 +129,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Functions with Additional Scalar Arguments" + "Functions with Additional Scalar Arguments\n", + "---------------------------------------------------" ] }, { @@ -181,7 +185,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Nullable Data" + "Nullable Data\n", + "----------------" ] }, { @@ -307,7 +312,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Lower level control with custom `numba` kernels" + "Lower level control with custom `numba` kernels\n", + "---------------------------------------------------------" ] }, { @@ -472,7 +478,8 @ "cell_type": "markdown", "metadata": {}, "source": [ - "## DataFrame UDFs\n", + "DataFrame UDFs\n", + "--------------------\n", "\n", "Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input:\n", "\n", @@ -485,7 +492,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# `cudf.DataFrame.apply`" + "`cudf.DataFrame.apply`\n", + "---------------------------" ] }, { @@ -1197,
+1205,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Numba kernels for DataFrames" + "Numba kernels for DataFrames\n", + "------------------------------------" ] }, { @@ -1546,7 +1555,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Rolling Window UDFs\n", + "Rolling Window UDFs\n", + "-------------------------\n", "\n", "For time-series data, we may need to operate on a small \\\"window\\\" of our column at a time, processing each portion independently. We could slide (\\\"roll\\\") this window over the entire column to answer questions like \\\"What is the 3-day moving average of a stock price over the past year?\"\n", "\n", @@ -1859,7 +1869,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## GroupBy DataFrame UDFs\n", + "GroupBy DataFrame UDFs\n", + "-------------------------------\n", "\n", "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", "\n", @@ -2155,7 +2166,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Numba Kernels on CuPy Arrays\n", + "Numba Kernels on CuPy Arrays\n", + "-------------------------------------\n", "\n", "We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series." ] @@ -2257,7 +2269,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Caveats" + "Caveats\n", + "---------" ] }, { @@ -2272,7 +2285,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Summary\n", + "Summary\n", + "-----------\n", "\n", "This guide has covered a lot of content. At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on\n", "\n", @@ -2305,7 +2319,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.8.13" } }, "nbformat": 4, From c8c727120b908d527ba70095bb6499fce9f88ac5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 20 Apr 2022 14:49:54 -0500 Subject: [PATCH 24/26] Additional refactoring of hash functions (#10462) Additional work related to #10081. This is breaking because it reorganizes several public names/namespaces. Summary of changes in this PR: - The `cudf` namespace now wraps the contents of `hash_functions.cuh`, and some public names are now classified as `detail` APIs. 
- `SparkMurmurHash3_32` has been updated to align with the design and naming conventions of `MurmurHash3_32` Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10462 --- .../cudf/detail/utilities/hash_functions.cuh | 231 +++++++++--------- cpp/src/groupby/hash/groupby.cu | 12 +- cpp/src/hash/concurrent_unordered_map.cuh | 2 +- cpp/src/io/json/json_gpu.cu | 5 +- cpp/src/io/parquet/chunk_dict.cu | 5 +- cpp/src/partitioning/partitioning.cu | 4 +- cpp/src/text/subword/bpe_tokenizer.cu | 5 +- cpp/src/text/subword/bpe_tokenizer.cuh | 4 +- cpp/src/text/subword/load_merges_file.cu | 13 +- 9 files changed, 147 insertions(+), 134 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 09d94d10e79..9c6f3e9cb13 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -31,11 +31,24 @@ #include #include +namespace cudf { + using hash_value_type = uint32_t; -namespace cudf { namespace detail { +/** + * Normalization of floating point NaNs, passthrough for all other values. + */ +template +T __device__ inline normalize_nans(T const& key) +{ + if constexpr (cudf::is_floating_point()) { + if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + } + return key; +} + /** * Normalization of floating point NaNs and zeros, passthrough for all other values. */ @@ -43,13 +56,9 @@ template T __device__ inline normalize_nans_and_zeros(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { - return std::numeric_limits::quiet_NaN(); - } else if (key == T{0.0}) { - return T{0.0}; - } + if (key == T{0.0}) { return T{0.0}; } } - return key; + return normalize_nans(key); } __device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r) @@ -176,9 +185,6 @@ void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destinatio std::memcpy(destination, reinterpret_cast(&x), 8); } -} // namespace detail -} // namespace cudf - // MurmurHash3_32 implementation from // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp //----------------------------------------------------------------------------- @@ -192,7 +198,7 @@ template struct MurmurHash3_32 { using result_type = hash_value_type; - MurmurHash3_32() = default; + constexpr MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const @@ -214,24 +220,9 @@ struct MurmurHash3_32 { return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - // TODO Do we need this operator() and/or compute? Probably not both. 
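A brief aside on why the normalization helpers exist: IEEE 754 gives +0.0 and -0.0 different bit patterns, and NaN admits many payloads, so hashing raw bits would scatter keys that compare equal. A host-side C++ sketch of the idea (illustrative; the header's actual helpers are `normalize_nans` and `normalize_nans_and_zeros`):

```
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

// Canonicalize floating-point keys before hashing their bytes.
double normalize(double key)
{
  if (std::isnan(key)) { return std::numeric_limits<double>::quiet_NaN(); }
  if (key == 0.0) { return 0.0; }  // folds -0.0 into +0.0
  return key;
}

uint64_t bit_pattern(double v)
{
  uint64_t b;
  std::memcpy(&b, &v, sizeof(b));
  return b;
}
// bit_pattern(-0.0) != bit_pattern(0.0), yet the two compare equal;
// after normalize() both produce the same bits, hence the same hash.
```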
[[nodiscard]] result_type __device__ inline operator()(Key const& key) const { - return compute(key); - } - - // compute wrapper for floating point types - template >* = nullptr> - hash_value_type __device__ inline compute_floating_point(T const& key) const - { - if (key == T{0.0}) { - return compute(T{0.0}); - } else if (std::isnan(key)) { - T nan = std::numeric_limits::quiet_NaN(); - return compute(nan); - } else { - return compute(key); - } + return compute(detail::normalize_nans_and_zeros(key)); } template @@ -240,17 +231,32 @@ struct MurmurHash3_32 { return compute_bytes(reinterpret_cast(&key), sizeof(T)); } + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const + { + // Process remaining bytes that do not fill a four-byte chunk. + uint32_t k1 = 0; + switch (len % 4) { + case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; [[fallthrough]]; + case 1: + k1 ^= std::to_integer(data[tail_offset]); + k1 *= c1; + k1 = cudf::detail::rotate_bits_left(k1, rot_c1); + k1 *= c2; + h ^= k1; + }; + return h; + } + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const { constexpr cudf::size_type BLOCK_SIZE = 4; cudf::size_type const nblocks = len / BLOCK_SIZE; cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - constexpr uint32_t c3 = 0xe6546b64; - constexpr uint32_t rot_c1 = 15; - constexpr uint32_t rot_c2 = 13; + result_type h = m_seed; // Process all four-byte chunks. for (cudf::size_type i = 0; i < nblocks; i++) { @@ -258,50 +264,44 @@ struct MurmurHash3_32 { k1 *= c1; k1 = cudf::detail::rotate_bits_left(k1, rot_c1); k1 *= c2; - h1 ^= k1; - h1 = cudf::detail::rotate_bits_left(h1, rot_c2); - h1 = h1 * 5 + c3; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; } - // Process remaining bytes that do not fill a four-byte chunk. - uint32_t k1 = 0; - switch (len % 4) { - case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; - case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; - case 1: - k1 ^= std::to_integer(data[tail_offset]); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h1 ^= k1; - }; + h = compute_remaining_bytes(data, len, tail_offset, h); // Finalize hash. 
- h1 ^= len; - h1 = fmix32(h1); - return h1; + h ^= len; + h = fmix32(h); + return h; } private: uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; }; template <> hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& key) const { - return this->compute(static_cast(key)); + return compute(static_cast(key)); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans_and_zeros(key)); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans_and_zeros(key)); } template <> @@ -310,28 +310,28 @@ hash_value_type __device__ inline MurmurHash3_32::operator()( { auto const data = reinterpret_cast(key.data()); auto const len = key.size_bytes(); - return this->compute_bytes(data, len); + return compute_bytes(data, len); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()( numeric::decimal32 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()( numeric::decimal64 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> hash_value_type __device__ inline MurmurHash3_32::operator()( numeric::decimal128 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> @@ -352,10 +352,10 @@ template struct SparkMurmurHash3_32 { using result_type = hash_value_type; - SparkMurmurHash3_32() = default; + constexpr SparkMurmurHash3_32() = default; constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} - __device__ inline uint32_t fmix32(uint32_t h) const + [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -365,18 +365,18 @@ struct SparkMurmurHash3_32 { return h; } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } + [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, + cudf::size_type offset) const + { + // Read a 4-byte value from the data pointer as individual bytes for safe + // unaligned access (very likely for string types). 
+ auto block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + } - // compute wrapper for floating point types - template >* = nullptr> - hash_value_type __device__ inline compute_floating_point(T const& key) const + [[nodiscard]] result_type __device__ inline operator()(Key const& key) const { - if (std::isnan(key)) { - T nan = std::numeric_limits::quiet_NaN(); - return compute(nan); - } else { - return compute(key); - } + return compute(key); } template @@ -385,24 +385,35 @@ struct SparkMurmurHash3_32 { return compute_bytes(reinterpret_cast(&key), sizeof(T)); } - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const { - // Individual byte reads for unaligned accesses (very likely for strings) - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + // Process remaining bytes that do not fill a four-byte chunk using Spark's approach + // (does not conform to normal MurmurHash3). + for (auto i = tail_offset; i < len; i++) { + // We require a two-step cast to get the k1 value from the byte. First, + // we must cast to a signed int8_t. Then, the sign bit is preserved when + // casting to uint32_t under 2's complement. Java preserves the sign when + // casting byte-to-int, but C++ does not. + uint32_t k1 = static_cast(std::to_integer(data[i])); + k1 *= c1; + k1 = cudf::detail::rotate_bits_left(k1, rot_c1); + k1 *= c2; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; + } + return h; } result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const { constexpr cudf::size_type BLOCK_SIZE = 4; cudf::size_type const nblocks = len / BLOCK_SIZE; - result_type h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - constexpr uint32_t c3 = 0xe6546b64; - constexpr uint32_t rot_c1 = 15; - constexpr uint32_t rot_c2 = 13; + cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; + result_type h = m_seed; // Process all four-byte chunks. for (cudf::size_type i = 0; i < nblocks; i++) { @@ -410,78 +421,69 @@ struct SparkMurmurHash3_32 { k1 *= c1; k1 = cudf::detail::rotate_bits_left(k1, rot_c1); k1 *= c2; - h1 ^= k1; - h1 = cudf::detail::rotate_bits_left(h1, rot_c2); - h1 = h1 * 5 + c3; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; } - // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3). - for (cudf::size_type i = nblocks * 4; i < len; i++) { - // We require a two-step cast to get the k1 value from the byte. First, - // we must cast to a signed int8_t. Then, the sign bit is preserved when - // casting to uint32_t under 2's complement. Java preserves the - // signedness when casting byte-to-int, but C++ does not. - uint32_t k1 = static_cast(std::to_integer(data[i])); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h1 ^= k1; - h1 = cudf::detail::rotate_bits_left(h1, rot_c2); - h1 = h1 * 5 + c3; - } + h = compute_remaining_bytes(data, len, tail_offset, h); // Finalize hash. 
- h1 ^= len; - h1 = fmix32(h1); - return h1; + h ^= len; + h = fmix32(h); + return h; } private: uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; }; template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(bool const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int8_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(uint8_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int16_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( uint16_t const& key) const { - return this->compute(key); + return compute(key); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(float const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans(key)); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()(double const& key) const { - return this->compute_floating_point(key); + return compute(detail::normalize_nans(key)); } template <> @@ -490,21 +492,21 @@ hash_value_type __device__ inline SparkMurmurHash3_32::operat { auto const data = reinterpret_cast(key.data()); auto const len = key.size_bytes(); - return this->compute_bytes(data, len); + return compute_bytes(data, len); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( numeric::decimal32 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( numeric::decimal64 const& key) const { - return this->compute(key.value()); + return compute(key.value()); } template <> @@ -546,7 +548,7 @@ hash_value_type __device__ inline SparkMurmurHash3_32::oper __int128_t big_endian_value = 0; auto big_endian_data = reinterpret_cast(&big_endian_value); thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); - return this->compute_bytes(big_endian_data, length); + return compute_bytes(big_endian_data, length); } template <> @@ -593,3 +595,6 @@ struct IdentityHash { template using default_hash = MurmurHash3_32; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 44df981f5bf..f225afaec71 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -429,17 +429,19 @@ auto create_hash_map(table_device_view const& d_keys, size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; - using map_type = concurrent_unordered_map, - row_equality_comparator>; + using map_type = + concurrent_unordered_map, + row_equality_comparator>; using allocator_type = typename map_type::allocator_type; auto const null_keys_are_equal = include_null_keys == null_policy::INCLUDE ? 
null_equality::EQUAL : null_equality::UNEQUAL; - row_hasher hasher{nullate::DYNAMIC{keys_have_nulls}, d_keys}; + row_hasher hasher{nullate::DYNAMIC{keys_have_nulls}, + d_keys}; row_equality_comparator rows_equal{ nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal}; diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 76f3fba4689..9136410a03d 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -113,7 +113,7 @@ union pair_packer()>> { */ template , + typename Hasher = cudf::detail::default_hash, typename Equality = equal_to, typename Allocator = default_allocator>> class concurrent_unordered_map { diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 56a00191ae4..43411157319 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -356,7 +356,7 @@ __device__ field_descriptor next_field_descriptor(const char* begin, ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); - auto const key_hash = MurmurHash3_32{}( + auto const key_hash = cudf::detail::MurmurHash3_32{}( cudf::string_view(key_range.first, key_range.second - key_range.first)); auto const hash_col = col_map.find(key_hash); // Fall back to field index if not found (parsing error) @@ -667,7 +667,8 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); keys_info->column(1).element(idx) = len; keys_info->column(2).element(idx) = - MurmurHash3_32{}(cudf::string_view(field_range.key_begin, len)); + cudf::detail::MurmurHash3_32{}( + cudf::string_view(field_range.key_begin, len)); } } } diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index d3ac491f416..9075a319ab3 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -60,7 +60,10 @@ struct equality_functor { template struct hash_functor { column_device_view const& col; - __device__ auto operator()(size_type idx) { return MurmurHash3_32{}(col.element(idx)); } + __device__ auto operator()(size_type idx) const + { + return cudf::detail::MurmurHash3_32{}(col.element(idx)); + } }; struct map_insert_fn { diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 43686b7d257..09f07a1ca8c 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -779,10 +779,10 @@ std::pair, std::vector> hash_partition( if (!is_numeric(input.column(column_id).type())) CUDF_FAIL("IdentityHash does not support this data type"); } - return detail::local::hash_partition( + return detail::local::hash_partition( input, columns_to_hash, num_partitions, seed, stream, mr); case (hash_id::HASH_MURMUR3): - return detail::local::hash_partition( + return detail::local::hash_partition( input, columns_to_hash, num_partitions, seed, stream, mr); default: CUDF_FAIL("Unsupported hash function in hash_partition"); } diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index fb631b3f31f..404ecf1248c 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -144,8 +145,8 @@ struct byte_pair_encoding_fn { * @param rhs Second string. 
* @return The hash value to match with `d_map`.
 */
- __device__ hash_value_type compute_hash(cudf::string_view const& lhs,
- cudf::string_view const& rhs)
+ __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs,
+ cudf::string_view const& rhs)
 {
 __shared__ char shmem[48 * 1024]; // max for Pascal
 auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1;
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 31cc29a8d8a..24b10fc4a36 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -36,12 +36,12 @@ namespace detail {
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>;
-using merge_pairs_map_type = cuco::static_map;
-using string_hasher_type = MurmurHash3_32;
+using string_hasher_type = cudf::detail::MurmurHash3_32;
 } // namespace detail
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index 31f579dc9d4..1e0c9c81fcd 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -42,7 +43,7 @@ struct make_pair_function {
 /**
 * @brief Hash the merge pair entry
 */
- __device__ cuco::pair_type operator()(cudf::size_type idx)
+ __device__ cuco::pair_type operator()(cudf::size_type idx)
 {
 auto const result = _hasher(d_strings.element(idx));
 return cuco::make_pair(result, idx);
@@ -105,9 +106,9 @@ std::unique_ptr initialize_merge_pairs_map(
 // Ensure capacity is at least (size/0.7) as documented here:
 // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
 auto merge_pairs_map = std::make_unique(
- static_cast(input.size() * 2), // capacity is 2x;
- std::numeric_limits::max(), // empty key;
- -1, // empty value is not used
+ static_cast(input.size() * 2), // capacity is 2x;
+ std::numeric_limits::max(), // empty key;
+ -1, // empty value is not used
 hash_table_allocator_type{default_allocator{}, stream},
 stream.value());
@@ -117,8 +118,8 @@ std::unique_ptr initialize_merge_pairs_map(
 merge_pairs_map->insert(iter,
 iter + input.size(),
- cuco::detail::MurmurHash3_32{},
- thrust::equal_to{},
+ cuco::detail::MurmurHash3_32{},
+ thrust::equal_to{},
 stream.value());
 return merge_pairs_map;

From 01d08af14a6706f82ce5fca0b6b1497ac1a8c02c Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Wed, 20 Apr 2022 22:11:22 +0200
Subject: [PATCH 25/26] KvikIO as an alternative GDS backend (#10593)

This PR is a new take on #10468 that is less intrusive. It keeps the existing GDS backend and adds a new option `LIBCUDF_CUFILE_POLICY=KVIKIO` that makes cudf use KvikIO. The default policy is still `LIBCUDF_CUFILE_POLICY=GDS`.

cc. @vuule, @devavret, @GregoryKimball

Authors:
- Mads R. B.
Kristensen (https://github.com/madsbk) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) - Devavret Makkar (https://github.com/devavret) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10593 --- cpp/CMakeLists.txt | 4 +- cpp/cmake/thirdparty/get_kvikio.cmake | 31 +++++++++ cpp/src/io/utilities/config_utils.cpp | 5 +- cpp/src/io/utilities/config_utils.hpp | 5 ++ cpp/src/io/utilities/data_sink.cpp | 44 +++++++++---- cpp/src/io/utilities/datasource.cpp | 52 ++++++++------- cpp/src/io/utilities/file_io_utilities.cpp | 24 ------- cpp/src/io/utilities/file_io_utilities.hpp | 63 ------------------- .../cudf/source/basics/io-gds-integration.rst | 9 ++- 9 files changed, 111 insertions(+), 126 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_kvikio.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dbc55827a32..7ed1aaed53b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -155,6 +155,8 @@ include(cmake/thirdparty/get_gtest.cmake) include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile include(cmake/Modules/FindcuFile.cmake) +# find KvikIO +include(cmake/thirdparty/get_kvikio.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) @@ -586,7 +588,7 @@ add_dependencies(cudf jitify_preprocess_run) target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} libcudacxx::libcudacxx cudf::Thrust rmm::rmm - PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp + PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio ) # Add Conda library, and include paths if specified diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake new file mode 100644 index 00000000000..800ab2d5c6f --- /dev/null +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -0,0 +1,31 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds KvikIO +function(find_and_configure_kvikio VERSION) + + rapids_cpm_find( + KvikIO ${VERSION} + GLOBAL_TARGETS kvikio::kvikio + CPM_ARGS + GIT_REPOSITORY https://github.com/rapidsai/kvikio.git + GIT_TAG branch-${VERSION} + GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + ) + +endfunction() + +set(KVIKIO_MIN_VERSION_cudf "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") +find_and_configure_kvikio(${KVIKIO_MIN_VERSION_cudf}) diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index ed8c3d6e1e3..08b5914cb19 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -35,7 +35,7 @@ namespace { /** * @brief Defines which cuFile usage to enable. 
*/ -enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; +enum class usage_policy : uint8_t { OFF, GDS, ALWAYS, KVIKIO }; /** * @brief Get the current usage policy. @@ -46,6 +46,7 @@ usage_policy get_env_policy() if (env_val == "OFF") return usage_policy::OFF; if (env_val == "GDS") return usage_policy::GDS; if (env_val == "ALWAYS") return usage_policy::ALWAYS; + if (env_val == "KVIKIO") return usage_policy::KVIKIO; CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); } } // namespace @@ -54,6 +55,8 @@ bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_policy::GDS; } +bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } + } // namespace cufile_integration namespace nvcomp_integration { diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index 80c20529687..4f6a14091cf 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -48,6 +48,11 @@ bool is_always_enabled(); */ bool is_gds_enabled(); +/** + * @brief Returns true if KvikIO is enabled. + */ +bool is_kvikio_enabled(); + } // namespace cufile_integration namespace nvcomp_integration { diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 63d0103ddec..042afc01253 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,9 @@ #include "file_io_utilities.hpp" #include #include +#include +#include #include namespace cudf { @@ -30,10 +32,15 @@ namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) - : _cufile_out(detail::make_cufile_output(filepath)) { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); + + if (detail::cufile_integration::is_kvikio_enabled()) { + _kvikio_file = kvikio::FileHandle(filepath, "w"); + } else { + _cufile_out = detail::make_cufile_output(filepath); + } } virtual ~file_sink() { flush(); } @@ -49,19 +56,15 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - [[nodiscard]] bool supports_device_write() const override { return _cufile_out != nullptr; } - - [[nodiscard]] bool is_device_write_preferred(size_t size) const override + [[nodiscard]] bool supports_device_write() const override { - return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + return !_kvikio_file.closed() || _cufile_out != nullptr; } - void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - - _cufile_out->write(gpu_data, _bytes_written, size); - _bytes_written += size; + return !_kvikio_file.closed() || + (_cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size)); } std::future device_write_async(void const* gpu_data, @@ -70,15 +73,30 @@ class file_sink : public data_sink { { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - auto 
result = _cufile_out->write_async(gpu_data, _bytes_written, size); + size_t offset = _bytes_written; _bytes_written += size; - return result; + + if (!_kvikio_file.closed()) { + // KvikIO's `pwrite()` returns a `std::future` so we convert it + // to `std::future` + return std::async(std::launch::deferred, [this, gpu_data, size, offset] { + _kvikio_file.pwrite(gpu_data, size, offset).get(); + }); + } + return _cufile_out->write_async(gpu_data, offset, size); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + return device_write_async(gpu_data, _bytes_written, stream).get(); } private: std::ofstream _output_stream; size_t _bytes_written = 0; std::unique_ptr _cufile_out; + kvikio::FileHandle _kvikio_file; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6f864ab509f..80e07f31dd9 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,9 @@ #include #include +#include +#include + #include #include #include @@ -33,28 +36,38 @@ namespace { */ class file_source : public datasource { public: - explicit file_source(const char* filepath) - : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) + explicit file_source(const char* filepath) : _file(filepath, O_RDONLY) { + if (detail::cufile_integration::is_kvikio_enabled()) { + _kvikio_file = kvikio::FileHandle(filepath); + } else { + _cufile_in = detail::make_cufile_input(filepath); + } } virtual ~file_source() = default; - [[nodiscard]] bool supports_device_read() const override { return _cufile_in != nullptr; } + [[nodiscard]] bool supports_device_read() const override + { + return !_kvikio_file.closed() || _cufile_in != nullptr; + } [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + return !_kvikio_file.closed() || + (_cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size)); } - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, stream); + if (!_kvikio_file.closed()) { return _kvikio_file.pread(dst, read_size, offset); } + return _cufile_in->read_async(offset, read_size, dst, stream); } size_t device_read(size_t offset, @@ -62,21 +75,17 @@ class file_source : public datasource { uint8_t* dst, rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, dst, stream); + return device_read_async(offset, size, dst, stream).get(); } - std::future device_read_async(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) override + std::unique_ptr device_read(size_t offset, + size_t 
size, + rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read_async(offset, read_size, dst, stream); + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); } [[nodiscard]] size_t size() const override { return _file.size(); } @@ -86,6 +95,7 @@ class file_source : public datasource { private: std::unique_ptr _cufile_in; + kvikio::FileHandle _kvikio_file; }; /** diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index f7e250f1d3f..c0dd85702e2 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -176,16 +176,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) pool.sleep_duration = 10; } -std::unique_ptr cufile_input_impl::read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) -{ - rmm::device_buffer out_data(size, stream); - auto read_size = read(offset, size, reinterpret_cast(out_data.data()), stream); - out_data.resize(read_size, stream); - return datasource::buffer::create(std::move(out_data)); -} - namespace { template cufile_input_impl::read_async(size_t offset, return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } -size_t cufile_input_impl::read(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) -{ - auto result = read_async(offset, size, dst, stream); - return result.get(); -} - cufile_output_impl::cufile_output_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664), @@ -250,11 +231,6 @@ cufile_output_impl::cufile_output_impl(std::string const& filepath) { } -void cufile_output_impl::write(void const* data, size_t offset, size_t size) -{ - write_async(data, offset, size).wait(); -} - std::future cufile_output_impl::write_async(void const* data, size_t offset, size_t size) { int device; diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index be3ecc49ab0..704ee77de8a 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -80,35 +80,6 @@ class cufile_io_base { */ class cufile_input : public cufile_io_base { public: - /** - * @brief Reads into a new device buffer. - * - * @throws cudf::logic_error on cuFile error - * - * @param offset Number of bytes from the start - * @param size Number of bytes to read - * @param stream CUDA stream to use - * - * @return The data buffer in the device memory - */ - virtual std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) = 0; - - /** - * @brief Reads into existing device memory. - * - * @throws cudf::logic_error on cuFile error - * - * @param offset Number of bytes from the start - * @param size Number of bytes to read - * @param dst Address of the existing device memory - * @param stream CUDA stream to use - * - * @return The number of bytes read - */ - virtual size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) = 0; - /** * @brief Asynchronously reads into existing device memory. 
* @@ -132,17 +103,6 @@ class cufile_input : public cufile_io_base { */ class cufile_output : public cufile_io_base { public: - /** - * @brief Writes the data from a device buffer into a file. - * - * @throws cudf::logic_error on cuFile error - * - * @param data Pointer to the buffer to be written into the output file - * @param offset Number of bytes from the start - * @param size Number of bytes to write - */ - virtual void write(void const* data, size_t offset, size_t size) = 0; - /** * @brief Asynchronously writes the data from a device buffer into a file. * @@ -203,12 +163,6 @@ class cufile_input_impl final : public cufile_input { public: cufile_input_impl(std::string const& filepath); - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override; - - size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override; - std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -229,7 +183,6 @@ class cufile_output_impl final : public cufile_output { public: cufile_output_impl(std::string const& filepath); - void write(void const* data, size_t offset, size_t size) override; std::future write_async(void const* data, size_t offset, size_t size) override; private: @@ -241,18 +194,6 @@ class cufile_output_impl final : public cufile_output { class cufile_input_impl final : public cufile_input { public: - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } - - size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } - std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -264,10 +205,6 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public cufile_output { public: - void write(void const* data, size_t offset, size_t size) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } std::future write_async(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 5ff07ac29c5..ce774453386 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -10,10 +10,11 @@ GDS is also included in CUDA Toolkit 11.4 and higher. Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. This variable also controls the GDS compatibility mode. -There are three valid values for the environment variable: +There are four valid values for the environment variable: - "GDS": Enable GDS use; GDS compatibility mode is *off*. - "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. +- "KVIKIO": Enable GDS through `KvikIO `_. - "OFF": Completely disable GDS use. If no value is set, behavior will be the same as the "GDS" option. @@ -21,7 +22,9 @@ If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. 
When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on),
-cuDF throws an exception to propagate the error to te user.
+cuDF throws an exception to propagate the error to the user.
+When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a KvikIO API call fails for any reason (unlikely, given that KvikIO implements its own compatibility mode) cuDF throws an exception to propagate the error to the user.
+For more information about error handling, compatibility mode, and tuning parameters in KvikIO see: https://github.com/rapidsai/kvikio

 Operations that support the use of GPUDirect Storage:
@@ -36,4 +39,4 @@ Several parameters that can be used to tune the performance of GDS-enabled I/O a
 - ``LIBCUDF_CUFILE_THREAD_COUNT``: Integral value, maximum number of parallel reads/writes per file (default 16);
 - ``LIBCUDF_CUFILE_SLICE_SIZE``: Integral value, maximum size of each GDS read/write, in bytes (default 4MB).
- Larger I/O operations are split into multiple calls.
\ No newline at end of file
+ Larger I/O operations are split into multiple calls.

From d5e6941fe097c2eceedbe3268e8e6baea30d1a0e Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 21 Apr 2022 09:02:21 -0400
Subject: [PATCH 26/26] Remove cudf::strings::string namespace (#10684)

Minor change collapsing the unnecessary `cudf::strings::string` namespace in the `string.cuh` functions' declarations/definitions to just `cudf::strings`. The extra `string` namespace is confusing and not needed. Calling functions have been updated appropriately.

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Nghia Truong (https://github.com/ttnghia)
- Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10684
---
 cpp/include/cudf/strings/string.cuh | 2 --
 cpp/src/strings/convert/convert_floats.cu | 6 +++---
 cpp/src/strings/convert/convert_integers.cu | 13 ++++++-------
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh
index 0cfcaeb913e..d20080cc0e5 100644
--- a/cpp/include/cudf/strings/string.cuh
+++ b/cpp/include/cudf/strings/string.cuh
@@ -23,7 +23,6 @@
 namespace cudf {
 namespace strings {
-namespace string {
 /**
 * @addtogroup strings_classes
 * @{
@@ -150,6 +149,5 @@ inline __device__ bool is_float(string_view const& d_str)
 }
 /** @} */ // end of group
-} // namespace string
 } // namespace strings
 } // namespace cudf
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index b8a10a00f5b..8acf348ef05 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -67,8 +67,8 @@ __device__ inline double stod(string_view const& d_str)
 // special strings: NaN, Inf
 if ((in_ptr < end) && *in_ptr > '9') {
 auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end)));
- if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN();
- if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity();
+ if (is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN();
+ if (is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity();
 }
 // Parse and store the mantissa as much as we can,
@@ -567,7 +567,7 @@ std::unique_ptr is_float(
 d_results,
 [d_column] __device__(size_type idx) {
 if (d_column.is_null(idx)) return false;
-
return string::is_float(d_column.element(idx)); + return strings::is_float(d_column.element(idx)); }); results->set_null_count(strings.null_count()); return results; diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 95ddf1822a7..75c2f851bab 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -150,14 +150,13 @@ std::unique_ptr is_integer( d_column->pair_begin(), d_column->pair_end(), d_results, - [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + [] __device__(auto const& p) { return p.second ? strings::is_integer(p.first) : false; }); } else { - thrust::transform( - rmm::exec_policy(stream), - d_column->pair_begin(), - d_column->pair_end(), - d_results, - [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return strings::is_integer(p.first); }); } // Calling mutable_view() on a column invalidates it's null count so we need to set it back
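
A closing note on patch 24: the MurmurHash3_32 changes reorganize the standard MurmurHash3 32-bit round structure, namely a block loop over four-byte chunks, a tail step for the remaining 0-3 bytes (factored out above as compute_remaining_bytes), and a final avalanche (fmix32). A minimal host-side C++ sketch of that structure, following the public-domain reference algorithm rather than libcudf's device code:

#include <cstddef>
#include <cstdint>
#include <cstring>

static inline uint32_t rotl32(uint32_t x, uint32_t r) { return (x << r) | (x >> (32 - r)); }

// Host-side sketch of MurmurHash3 32-bit (x86 variant); not libcudf's
// device implementation, but the same round structure the patch refactors.
uint32_t murmur3_32(void const* key, size_t len, uint32_t seed)
{
  auto const* data = static_cast<uint8_t const*>(key);
  uint32_t h = seed;
  constexpr uint32_t c1 = 0xcc9e2d51;
  constexpr uint32_t c2 = 0x1b873593;

  // Block loop: mix each four-byte chunk into the running hash.
  size_t const nblocks = len / 4;
  for (size_t i = 0; i < nblocks; ++i) {
    uint32_t k;
    std::memcpy(&k, data + i * 4, sizeof(k));  // memcpy is unaligned-safe
    k *= c1;
    k = rotl32(k, 15);
    k *= c2;
    h ^= k;
    h = rotl32(h, 13);
    h = h * 5 + 0xe6546b64;
  }

  // Tail: fold in the remaining 0-3 bytes (compute_remaining_bytes above).
  uint8_t const* tail = data + nblocks * 4;
  uint32_t k = 0;
  switch (len & 3) {
    case 3: k ^= static_cast<uint32_t>(tail[2]) << 16; [[fallthrough]];
    case 2: k ^= static_cast<uint32_t>(tail[1]) << 8; [[fallthrough]];
    case 1:
      k ^= tail[0];
      k *= c1;
      k = rotl32(k, 15);
      k *= c2;
      h ^= k;
  }

  // Finalization: mix in the length, then run the fmix32 avalanche.
  h ^= static_cast<uint32_t>(len);
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;
  return h;
}

As the patch's SparkMurmurHash3_32::compute_remaining_bytes shows, Spark's variant differs only in this tail step: every remaining byte is sign-extended and mixed with a full round, which is why the two classes keep separate tail implementations.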
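
The config_utils.cpp change in patch 25 is a small string-to-enum dispatch over the LIBCUDF_CUFILE_POLICY environment variable. A standalone sketch of that pattern; getenv_or is written out here as a plain std::getenv wrapper and stands in for libcudf's internal helper:

#include <cstdlib>
#include <stdexcept>
#include <string>

enum class usage_policy { OFF, GDS, ALWAYS, KVIKIO };

// Stand-in for libcudf's internal getenv_or helper: return the variable's
// value, or a default when it is unset.
std::string getenv_or(char const* name, std::string default_val)
{
  char const* val = std::getenv(name);
  return val == nullptr ? default_val : std::string{val};
}

// "GDS" is the documented default when LIBCUDF_CUFILE_POLICY is unset.
usage_policy get_env_policy()
{
  auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS");
  if (env_val == "OFF") return usage_policy::OFF;
  if (env_val == "GDS") return usage_policy::GDS;
  if (env_val == "ALWAYS") return usage_policy::ALWAYS;
  if (env_val == "KVIKIO") return usage_policy::KVIKIO;
  throw std::invalid_argument("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val);
}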
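
The new device_write_async in data_sink.cpp also has to reconcile two future types: KvikIO's pwrite returns a future of the byte count written, while the data_sink interface promises std::future<void>. The patch bridges them with a deferred task that launches the write and waits on it. A self-contained sketch of that adapter, with fake_pwrite as a hypothetical stand-in for the KvikIO call:

#include <cstddef>
#include <future>

// Hypothetical stand-in for kvikio::FileHandle::pwrite: returns a future of
// the number of bytes written.
std::future<std::size_t> fake_pwrite(void const* /*data*/, std::size_t size, std::size_t /*offset*/)
{
  return std::async(std::launch::async, [size] { return size; });
}

// Adapter in the style of file_sink::device_write_async: consume the
// size-returning future inside a deferred void task; the write only starts
// when the caller waits on the returned future.
std::future<void> device_write_async(void const* data, std::size_t size, std::size_t offset)
{
  return std::async(std::launch::deferred,
                    [data, size, offset] { fake_pwrite(data, size, offset).get(); });
}

int main()
{
  char buf[8] = {};
  auto fut = device_write_async(buf, sizeof buf, 0);
  fut.get();  // triggers the deferred task and blocks until the write completes
  return 0;
}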
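
Finally, patch 26 only moves names (cudf::strings::string::is_integer becomes cudf::strings::is_integer), but the check those callers rely on is easy to picture. A rough host-side analogue of the sign-plus-digits test such an is_integer performs per string; the real device code operates on cudf::string_view and libcudf also provides overloads that additionally bound-check against a target integer type:

#include <cctype>
#include <cstddef>
#include <string_view>

// Host-side analogue of the basic integer-string check: an optional leading
// '+' or '-' followed by at least one decimal digit.
bool is_integer(std::string_view s)
{
  if (s.empty()) return false;
  std::size_t i = (s[0] == '+' || s[0] == '-') ? 1 : 0;
  if (i == s.size()) return false;  // a bare sign is not an integer
  for (; i < s.size(); ++i) {
    if (!std::isdigit(static_cast<unsigned char>(s[i]))) return false;
  }
  return true;
}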