From e64e26eda09f8508b7760ddba9f742c4f4e827cb Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 23 Feb 2023 18:07:56 -0800 Subject: [PATCH 01/10] Expose seed argument to hash_values (#12795) This PR exposes the `seed` param to `hash_values` that is already supported by libcudf's `hash` method. Closes #12775 Authors: - Ayush Dattagupta (https://github.com/ayushdg) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/12795 --- python/cudf/cudf/core/indexed_frame.py | 24 +++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 39 +++++++++++++++++++++--- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 43277fb55ff..2992cb005e5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1629,7 +1629,7 @@ def memory_usage(self, index=True, deep=False): """ raise NotImplementedError - def hash_values(self, method="murmur3"): + def hash_values(self, method="murmur3", seed=None): """Compute the hash of values in this column. Parameters @@ -1639,6 +1639,12 @@ def hash_values(self, method="murmur3"): * murmur3: MurmurHash3 hash function. * md5: MD5 hash function. + seed : int, optional + Seed value to use for the hash function. + Note - This only has effect for the following supported + hash functions: + * murmur3: MurmurHash3 hash function. + Returns ------- Series @@ -1665,6 +1671,11 @@ def hash_values(self, method="murmur3"): 1 947ca8d2c5f0f27437f156cfbfab0969 2 d0580ef52d27c043c8e341fd5039b166 dtype: object + >>> series.hash_values(method="murmur3", seed=42) + 0 2364453205 + 1 422621911 + 2 3353449140 + dtype: uint32 **DataFrame** @@ -1686,11 +1697,20 @@ def hash_values(self, method="murmur3"): 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ + seed_hash_methods = {"murmur3"} + if seed is None: + seed = 0 + elif method not in seed_hash_methods: + warnings.warn( + "Provided seed value has no effect for hash method" + f" `{method}`. Refer to the docstring for information" + " on hash methods that support the `seed` param" + ) # Note that both Series and DataFrame return Series objects from this # calculation, necessitating the unfortunate circular reference to the # child class here. 
return cudf.Series._from_data( - {None: libcudf.hash.hash([*self._columns], method)}, + {None: libcudf.hash.hash([*self._columns], method, seed)}, index=self.index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 09b9f57356c..13f312f6f0c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -38,6 +38,7 @@ NUMERIC_TYPES, assert_eq, assert_exceptions_equal, + assert_neq, does_not_raise, expect_warning_if, gen_rand, @@ -1323,9 +1324,10 @@ def test_assign(): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize("method", ["murmur3", "md5"]) -def test_dataframe_hash_values(nrows, method): +@pytest.mark.parametrize("seed", [None, 42]) +def test_dataframe_hash_values(nrows, method, seed): gdf = cudf.DataFrame() - data = np.asarray(range(nrows)) + data = np.arange(nrows) data[0] = data[-1] # make first and last the same gdf["a"] = data gdf["b"] = gdf.a + 100 @@ -1334,12 +1336,41 @@ def test_dataframe_hash_values(nrows, method): assert len(out) == nrows assert out.dtype == np.uint32 + warning_expected = ( + True if seed is not None and method not in {"murmur3"} else False + ) # Check single column - out_one = gdf[["a"]].hash_values(method=method) + if warning_expected: + with pytest.warns( + UserWarning, match="Provided seed value has no effect*" + ): + out_one = gdf[["a"]].hash_values(method=method, seed=seed) + else: + out_one = gdf[["a"]].hash_values(method=method, seed=seed) # First matches last assert out_one.iloc[0] == out_one.iloc[-1] # Equivalent to the cudf.Series.hash_values() - assert_eq(gdf["a"].hash_values(method=method), out_one) + if warning_expected: + with pytest.warns( + UserWarning, match="Provided seed value has no effect*" + ): + assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) + else: + assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) + + +@pytest.mark.parametrize("method", ["murmur3"]) +def test_dataframe_hash_values_seed(method): + gdf = cudf.DataFrame() + data = np.arange(10) + data[0] = data[-1] # make first and last the same + gdf["a"] = data + gdf["b"] = gdf.a + 100 + out_one = gdf.hash_values(method=method, seed=0) + out_two = gdf.hash_values(method=method, seed=1) + assert out_one.iloc[0] == out_one.iloc[-1] + assert out_two.iloc[0] == out_two.iloc[-1] + assert_neq(out_one, out_two) @pytest.mark.parametrize("nrows", [3, 10, 100, 1000]) From 2e80eba6f75b03f039517c947f386ede65842a4c Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 24 Feb 2023 10:28:25 -0600 Subject: [PATCH 02/10] Fix parquet `RangeIndex` bug (#12838) Possible fix for https://github.com/rapidsai/cudf/issues/12837 Avoids dropping RangeIndex when `columns` argument is passed to `read_parquet` (unless `columns=[]`). 
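A minimal sketch of the behavior this fixes, mirroring the new test below (the
in-memory `BytesIO` round-trip is only for illustration):

```python
from io import BytesIO

import pandas as pd

import cudf

buffer = BytesIO()
df = cudf.DataFrame({"a": [1, 2, 3]}, index=pd.RangeIndex(2, 5))
df.to_parquet(buffer)

# Before this fix, passing `columns` silently replaced the RangeIndex with a
# default index; it is now preserved unless `columns=[]` is requested.
got = cudf.read_parquet(buffer, columns=["a"])
assert isinstance(got.index, cudf.RangeIndex)
assert (got.index.start, got.index.stop) == (2, 5)
```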
Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/12838
---
 python/cudf/cudf/_lib/parquet.pyx      |  2 +-
 python/cudf/cudf/tests/test_parquet.py | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index e5520ae1987..464d9243408 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -170,7 +170,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     allow_range_index = True
     if columns is not None:
         cpp_columns.reserve(len(columns))
-        allow_range_index = False
+        allow_range_index = len(columns) > 0
         for col in columns:
             cpp_columns.push_back(str(col).encode())
     args.set_columns(cpp_columns)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index ccd62729a9d..661497e4650 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2650,6 +2650,20 @@ def test_parquet_columns_and_index_param(index, columns):
     assert_eq(expected, got, check_index_type=True)
 
 
+@pytest.mark.parametrize("columns", [None, ["b", "a"]])
+def test_parquet_columns_and_range_index(columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame(
+        {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=pd.RangeIndex(2, 5)
+    )
+    df.to_parquet(buffer)
+
+    expected = pd.read_parquet(buffer, columns=columns)
+    got = cudf.read_parquet(buffer, columns=columns)
+
+    assert_eq(expected, got, check_index_type=True)
+
+
 def test_parquet_nested_struct_list():
     buffer = BytesIO()
     data = {

From 0e4e6dd567964404934d96a1fe8fc14b1d25a526 Mon Sep 17 00:00:00 2001
From: Divye Gala
Date: Fri, 24 Feb 2023 12:07:51 -0500
Subject: [PATCH 03/10] Add `always_nullable` flag to Dremel encoding (#12727)

Closes #12389 by fixing the bug described here
https://github.com/rapidsai/cudf/issues/12389#issuecomment-1419949751.

This flag, when `always_nullable=true`, generates `definition levels` in the
Dremel encoding such that it considers every nested column and child to be
`nullable`, even if they actually are not.

In the context of `two_table_comparators`, this helps us produce consistently
mapped `definition levels` in case there are some nested columns or children
that are not nullable in either one or both of the tables.

This PR now exposes two APIs:
1. `cudf::detail::get_dremel_data(...)` : This API is consistent with standard
Dremel encoding
2. 
`cudf::detail::get_comparator_data(...)` : This API modifies the definition levels in Dremel encoding to produce the effect described above Authors: - Divye Gala (https://github.com/divyegala) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12727 --- cpp/include/cudf/lists/detail/dremel.hpp | 30 +++++++-- .../cudf/table/experimental/row_operators.cuh | 3 +- cpp/src/lists/dremel.cu | 48 ++++++++++---- cpp/src/table/row_operators.cu | 2 +- cpp/tests/search/search_list_test.cpp | 64 ++++++++++++++++++- 5 files changed, 124 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 4e3aeec2499..d36a4091947 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -183,16 +183,34 @@ struct dremel_data { * - | - | -- | --- * ``` * - * @param col Column of LIST type - * @param level_nullability Pre-determined nullability at each list level. Empty means infer from - * `col` + * @param input Column of LIST type + * @param nullability Pre-determined nullability at each list level. Empty means infer from + * `input` + * @param output_as_byte_array if `true`, then any nested list level that has a child of type + * `uint8_t` will be considered as the last level * @param stream CUDA stream used for device memory operations and kernel launches. - * * @return A struct containing dremel data */ -dremel_data get_dremel_data(column_view h_col, +dremel_data get_dremel_data(column_view input, std::vector nullability, bool output_as_byte_array, rmm::cuda_stream_view stream); +/** + * @brief Get Dremel offsets, repetition levels, and modified definition levels to be used for + * lexicographical comparators. The modified definition levels are produced by treating + * each nested column in the input as nullable + * + * @param input Column of LIST type + * @param nullability Pre-determined nullability at each list level. Empty means infer from + * `input` + * @param output_as_byte_array if `true`, then any nested list level that has a child of type + * `uint8_t` will be considered as the last level + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return A struct containing dremel data + */ +dremel_data get_comparator_data(column_view input, + std::vector nullability, + bool output_as_byte_array, + rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index f9ffbfcdf7b..2a207d2a5c4 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -487,7 +487,8 @@ class device_row_comparator { // element_index because either both rows have a deeply nested NULL at the // same position, and we'll "continue" in our iteration, or we will early // exit if only one of the rows has a deeply nested NULL - if (lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) { + if ((lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) or + (rcol.nullable() and r_def_levels[r_dremel_index] == r_max_def_level - 1)) { ++element_index; } if (l_def_level == r_def_level) { continue; } diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 26988622aee..c96a21df905 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -35,7 +35,7 @@ #include namespace cudf::detail { - +namespace { /** * @brief Functor to get definition level value for a nested struct column until the leaf level or * the first list level. @@ -46,6 +46,7 @@ struct def_level_fn { uint8_t const* d_nullability; uint8_t sub_level_start; uint8_t curr_def_level; + bool always_nullable; __device__ uint32_t operator()(size_type i) { @@ -55,7 +56,7 @@ struct def_level_fn { auto col = *parent_col; do { // If col not nullable then it does not contribute to def levels - if (d_nullability[l]) { + if (always_nullable or d_nullability[l]) { if (not col.nullable() or bit_is_set(col.null_mask(), i)) { ++def; } else { // We have found the shallowest level at which this row is null @@ -72,10 +73,11 @@ struct def_level_fn { } }; -dremel_data get_dremel_data(column_view h_col, - std::vector nullability, - bool output_as_byte_array, - rmm::cuda_stream_view stream) +dremel_data get_encoding(column_view h_col, + std::vector nullability, + bool output_as_byte_array, + bool always_nullable, + rmm::cuda_stream_view stream) { auto get_list_level = [](column_view col) { while (col.type().id() == type_id::STRUCT) { @@ -173,14 +175,14 @@ dremel_data get_dremel_data(column_view h_col, uint32_t def = 0; start_at_sub_level.push_back(curr_nesting_level_idx); while (col.type().id() == type_id::STRUCT) { - def += (nullability[curr_nesting_level_idx]) ? 1 : 0; + def += (always_nullable or nullability[curr_nesting_level_idx]) ? 1 : 0; col = col.child(0); ++curr_nesting_level_idx; } // At the end of all those structs is either a list column or the leaf. List column contributes // at least one def level. Leaf contributes 1 level only if it is nullable. - def += - (col.type().id() == type_id::LIST ? 1 : 0) + (nullability[curr_nesting_level_idx] ? 1 : 0); + def += (col.type().id() == type_id::LIST ? 1 : 0) + + (always_nullable or nullability[curr_nesting_level_idx] ? 
1 : 0); def_at_level.push_back(def); ++curr_nesting_level_idx; }; @@ -209,7 +211,7 @@ dremel_data get_dremel_data(column_view h_col, } } - auto [device_view_owners, d_nesting_levels] = + [[maybe_unused]] auto [device_view_owners, d_nesting_levels] = contiguous_copy_column_device_views(nesting_levels, stream); auto max_def_level = def_at_level.back(); @@ -297,7 +299,8 @@ dremel_data get_dremel_data(column_view h_col, def_level_fn{d_nesting_levels + level, d_nullability.data(), start_at_sub_level[level], - def_at_level[level]}); + def_at_level[level], + always_nullable}); // `nesting_levels.size()` == no of list levels + leaf. Max repetition level = no of list levels auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size() - 1); @@ -306,7 +309,8 @@ dremel_data get_dremel_data(column_view h_col, def_level_fn{d_nesting_levels + level + 1, d_nullability.data(), start_at_sub_level[level + 1], - def_at_level[level + 1]}); + def_at_level[level + 1], + always_nullable}); // Zip the input and output value iterators so that merge operation is done only once auto input_parent_zip_it = @@ -389,7 +393,8 @@ dremel_data get_dremel_data(column_view h_col, def_level_fn{d_nesting_levels + level, d_nullability.data(), start_at_sub_level[level], - def_at_level[level]}); + def_at_level[level], + always_nullable}); // Zip the input and output value iterators so that merge operation is done only once auto input_parent_zip_it = @@ -459,5 +464,22 @@ dremel_data get_dremel_data(column_view h_col, leaf_data_size, max_def_level}; } +} // namespace + +dremel_data get_dremel_data(column_view h_col, + std::vector nullability, + bool output_as_byte_array, + rmm::cuda_stream_view stream) +{ + return get_encoding(h_col, nullability, output_as_byte_array, false, stream); +} + +dremel_data get_comparator_data(column_view h_col, + std::vector nullability, + bool output_as_byte_array, + rmm::cuda_stream_view stream) +{ + return get_encoding(h_col, nullability, output_as_byte_array, true, stream); +} } // namespace cudf::detail diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 766a1b63905..8a63a6f6411 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -264,7 +264,7 @@ auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) std::vector dremel_device_views; for (auto const& col : table) { if (col.type().id() == type_id::LIST) { - dremel_data.push_back(detail::get_dremel_data(col, {}, false, stream)); + dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); dremel_device_views.push_back(dremel_data.back()); } } diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 1393095037d..1e97933fa4d 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@
 #include 
 #include 
 
-using namespace cudf::test::iterators;
+using cudf::test::iterators::null_at;
+using cudf::test::iterators::nulls_at;
 
 using bools_col  = cudf::test::fixed_width_column_wrapper<bool>;
 using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
@@ -347,3 +348,62 @@ TYPED_TEST(TypedListContainsTestColumnNeedles, ListsOfStructs)
   auto const result = cudf::contains(*haystack, *needles);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity);
 }
+
+struct ListLowerBound : public cudf::test::BaseFixture {
+};
+
+TEST_F(ListLowerBound, ListWithNulls)
+{
+  {
+    using lcw = cudf::test::lists_column_wrapper<double>;
+    auto const haystack = lcw{
+      lcw{-3.45967821e+12},  // 0
+      lcw{-3.6912186e-32},   // 1
+      lcw{9.721175},         // 2
+    };
+
+    auto const needles = lcw{
+      lcw{{0, 4.22671e+32}, null_at(0)},
+    };
+
+    auto const expect = int32s_col{0};
+    auto const result = cudf::lower_bound(cudf::table_view{{haystack}},
+                                          cudf::table_view{{needles}},
+                                          {cudf::order::ASCENDING},
+                                          {cudf::null_order::BEFORE});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result);
+  }
+
+  {
+    using lcw = cudf::test::lists_column_wrapper<int32_t>;
+    auto const col1 = lcw{
+      lcw{{0}, null_at(0)},  // 0
+      lcw{-80},              // 1
+      lcw{-17},              // 2
+    };
+
+    auto const col2 = lcw{
+      lcw{27},               // 0
+      lcw{{0}, null_at(0)},  // 1
+      lcw{},                 // 2
+    };
+
+    auto const val1 = lcw{
+      lcw{87},
+    };
+
+    auto const val2 = lcw{
+      lcw{},
+    };
+
+    cudf::table_view input{{col1, col2}};
+    cudf::table_view values{{val1, val2}};
+    std::vector<cudf::order> column_order{cudf::order::ASCENDING, cudf::order::DESCENDING};
+    std::vector<cudf::null_order> null_order_flags{cudf::null_order::BEFORE,
+                                                   cudf::null_order::BEFORE};
+
+    auto const expect = int32s_col{3};
+    auto const result = cudf::lower_bound(input, values, column_order, null_order_flags);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result);
+  }
+}

From 8a7fb2f14a73937d31f648a65f57bc47751e97c1 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Fri, 24 Feb 2023 12:25:49 -0600
Subject: [PATCH 04/10] Deprecate `inplace` parameters in categorical methods
 (#12824)

To get ready for pandas-2.0 compatibility, this PR deprecates `inplace` in the
following APIs:

- [x] `as_ordered`
- [x] `as_unordered`
- [x] `add_categories`
- [x] `remove_categories`
- [x] `set_categories`
- [x] `reorder_categories`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/12824
---
 python/cudf/cudf/core/column/categorical.py | 78 ++++++++++++++++++++-
 python/cudf/cudf/tests/test_categorical.py  | 19 +++--
 2 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index a1526d25512..52f7c0b957f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -141,6 +141,13 @@ def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]:
             or return a copy of this
             categorical with added categories.
 
+            .. deprecated:: 23.02
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Setting categories as ordered will always
+               return a new Categorical object.
+
         Returns
         -------
         Categorical
@@ -204,6 +211,13 @@ def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]:
             in-place or return a copy of this
             categorical with ordered set to False.
 
+            .. deprecated:: 23.02
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Setting categories as unordered will always
+               return a new Categorical object.
+
         Returns
         -------
         Categorical
@@ -286,6 +300,13 @@ def add_categories(
             or return a copy of this categorical with
             added categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Adding categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -318,7 +339,14 @@ def add_categories(
         dtype: category
         Categories (5, int64): [1, 2, 0, 3, 4]
         """
-
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in cudf.Series.cat.add_categories "
+                "is deprecated and will be removed in a future version of "
+                "cudf. Adding categories will always return a new "
+                "Categorical object.",
+                FutureWarning,
+            )
         old_categories = self._column.categories
         new_categories = column.as_column(
             new_categories,
@@ -371,6 +399,13 @@ def remove_categories(
             inplace or return a copy of this categorical
             with removed categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Removing categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -423,6 +458,16 @@ def remove_categories(
         dtype: category
         Categories (2, int64): [1, 2]
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in "
+                "cudf.Series.cat.remove_categories is deprecated and "
+                "will be removed in a future version of cudf. "
+                "Removing categories will always return a new "
+                "Categorical object.",
+                FutureWarning,
+            )
+
         cats = self.categories.to_series()
         removals = cudf.Series(removals, dtype=cats.dtype)
         removals_mask = removals.isin(cats)
@@ -485,6 +530,13 @@ def set_categories(
             or return a copy of this categorical with
             reordered categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Setting categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -524,6 +576,14 @@ def set_categories(
         dtype: category
         Categories (2, int64): [1, 10]
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in cudf.Series.cat.set_categories is "
+                "deprecated and will be removed in a future version of cudf. "
+                "Setting categories will always return a new Categorical "
+                "object.",
+                FutureWarning,
+            )
         return self._return_or_inplace(
             self._column.set_categories(
                 new_categories=new_categories, ordered=ordered, rename=rename
@@ -556,6 +616,13 @@ def reorder_categories(
             inplace or return a copy of this categorical
             with reordered categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Reordering categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -597,6 +664,15 @@ def reorder_categories(
         ValueError: items in new_categories are not the same as in
         old categories
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in "
+                "cudf.Series.cat.reorder_categories is deprecated "
+                "and will be removed in a future version of cudf. 
" + "Reordering categories will always return a new " + "Categorical object.", + FutureWarning, + ) return self._return_or_inplace( self._column.reorder_categories(new_categories, ordered=ordered), inplace=inplace, diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index fa8981cf7e3..496039ca2f8 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -443,10 +443,13 @@ def test_categorical_reorder_categories( "reorder_categories" ): pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) if inplace: + with pytest.warns(FutureWarning): + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) pd_sr_1 = pd_sr cd_sr_1 = cd_sr + else: + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) assert_eq(pd_sr_1, cd_sr_1) @@ -479,10 +482,14 @@ def test_categorical_add_categories(pd_str_cat, inplace): "add_categories" ): pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) - cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) + if inplace: + with pytest.warns(FutureWarning): + cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) pd_sr_1 = pd_sr cd_sr_1 = cd_sr + else: + cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) assert "d" in pd_sr_1.cat.categories.to_list() assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() @@ -516,10 +523,14 @@ def test_categorical_remove_categories(pd_str_cat, inplace): "remove_categories" ): pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) - cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) + if inplace: + with pytest.warns(FutureWarning): + cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) pd_sr_1 = pd_sr cd_sr_1 = cd_sr + else: + cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) assert "a" not in pd_sr_1.cat.categories.to_list() assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() @@ -529,7 +540,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace): # test using ordered operators with _hide_deprecated_pandas_categorical_inplace_warnings( "remove_categories" - ): + ) as _, pytest.warns(FutureWarning) as _: assert_exceptions_equal( lfunc=cd_sr.to_pandas().cat.remove_categories, rfunc=cd_sr.cat.remove_categories, From 54ee14e36157fe63d0eb58ed7ac8bafc2b1e4932 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Fri, 24 Feb 2023 19:37:29 +0100 Subject: [PATCH 05/10] Update datasets download URL (#12840) Update datasets download URL to reduce latency and costs Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12840 --- python/cudf/cudf/benchmarks/get_datasets.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/benchmarks/get_datasets.py b/python/cudf/cudf/benchmarks/get_datasets.py index f3b66eda512..7090539bcb0 100644 --- a/python/cudf/cudf/benchmarks/get_datasets.py +++ b/python/cudf/cudf/benchmarks/get_datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
 import argparse
 import os
@@ -9,10 +9,7 @@
 Dataset = namedtuple("Dataset", ["url", "dir"])
 
 datasets = {
     "cuio_dataset": Dataset(
-        (
-            "https://rapidsai-data.s3.us-east-2.amazonaws.com/cudf/"
-            "benchmark/avro_json_datasets.zip"
-        ),
+        "https://data.rapids.ai/cudf/benchmark/avro_json_datasets.zip",
         "cudf/benchmarks/cuio_data/",
     ),
 }

From 12e4501c49daac3d0e3837a3f65078e63e20b904 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 24 Feb 2023 13:42:49 -0500
Subject: [PATCH 06/10] Remove KAFKA_HOST_TEST from compute-sanitizer check
 (#12831)

Removes the `KAFKA_HOST_TEST` from the compute-sanitizer memcheck nightly
runs. The following error occurs when running this host test.

```
Running compute-sanitizer on KAFKA_HOST_TEST
========= COMPUTE-SANITIZER
Running main() from gmock_main.cc
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from KafkaDatasourceTest
[ RUN      ] KafkaDatasourceTest.MissingGroupID
[       OK ] KafkaDatasourceTest.MissingGroupID (0 ms)
[ RUN      ] KafkaDatasourceTest.InvalidConfigValues
[       OK ] KafkaDatasourceTest.InvalidConfigValues (0 ms)
[----------] 2 tests from KafkaDatasourceTest (0 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (0 ms total)
[  PASSED  ] 2 tests.
========= Error: Target application terminated before first instrumented API call
========= Tracking kernels launched by child processes requires the --target-processes all option.
```

Adding the `--target-processes all` option gives the same error. This change
disables the check for this test since it is a host test that checks error
conditions and does not appear to make any device calls.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/12831
---
 ci/test_cpp_memcheck.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh
index 0cad4fc3a3f..db9ce143d51 100755
--- a/ci/test_cpp_memcheck.sh
+++ b/ci/test_cpp_memcheck.sh
@@ -11,7 +11,7 @@ set +e
 rapids-logger "Memcheck gtests with rmm_mode=cuda"
 export GTEST_CUDF_RMM_MODE=cuda
 COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"
-for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do
+for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/* ; do
     test_name=$(basename ${gt})
     if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
         continue

From 77c2e03ec572527b5c5c7a3f7a48b0cabd29abde Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 24 Feb 2023 10:47:44 -0800
Subject: [PATCH 07/10] Consolidate linter configs into pyproject.toml (#12834)

This consolidation allows us to get rid of the now-unnecessary setup.cfg
files (thanks to removing versioneer in #12741). It also allows us to move
towards a fully pyproject.toml-driven build.
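As a rough illustration of what the consolidation buys, any TOML-aware tool or
script can now discover a linter's settings under its `[tool.<name>]` table.
This sketch assumes Python 3.11's stdlib `tomllib` and uses the `match-dir`
key added below as an example:

```python
import tomllib  # stdlib in Python >= 3.11; use the `tomli` package on older versions

with open("pyproject.toml", "rb") as f:
    config = tomllib.load(f)

# The pydocstyle settings that previously lived in setup.cfg now sit in a
# [tool.pydocstyle] table alongside the other linters' configuration.
print(config["tool"]["pydocstyle"]["match-dir"])
```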
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - David Wendt (https://github.com/davidwendt) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12834 --- .flake8 | 24 +++++++ .pre-commit-config.yaml | 12 ++-- ci/release/update-version.sh | 2 +- cpp/benchmarks/common/generate_input.cu | 4 +- cpp/benchmarks/common/generate_input.hpp | 6 +- .../developer_guide/contributing_guide.md | 8 +-- pyproject.toml | 38 +++++++++++ python/cudf/cudf/_lib/utils.pyx | 4 +- python/cudf/pyproject.toml | 43 +++++++++++++ python/cudf/setup.cfg | 32 ---------- python/cudf_kafka/pyproject.toml | 46 +++++++++++++ python/cudf_kafka/setup.cfg | 35 ---------- python/custreamz/pyproject.toml | 45 +++++++++++++ python/custreamz/setup.cfg | 34 ---------- python/dask_cudf/pyproject.toml | 45 +++++++++++++ python/dask_cudf/setup.cfg | 31 --------- setup.cfg | 64 ------------------- 17 files changed, 261 insertions(+), 212 deletions(-) create mode 100644 .flake8 delete mode 100644 python/cudf/setup.cfg delete mode 100644 python/cudf_kafka/setup.cfg delete mode 100644 python/custreamz/setup.cfg delete mode 100644 setup.cfg diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000000..e80e3afc443 --- /dev/null +++ b/.flake8 @@ -0,0 +1,24 @@ +# Copyright (c) 2017-2023, NVIDIA CORPORATION. + +[flake8] +filename = *.py, *.pyx, *.pxd, *.pxi +exclude = __init__.py, *.egg, build, docs, .git +force-check = True +ignore = + # line break before binary operator + W503, + # whitespace before : + E203 +per-file-ignores = + # Rules ignored only in Cython: + # E211: whitespace before '(' (used in multi-line imports) + # E225: Missing whitespace around operators (breaks cython casting syntax like ) + # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) + # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) + # E275: Missing whitespace after keyword (Doesn't work with Cython except?) 
+ # E402: invalid syntax (works for Python, not Cython) + # E999: invalid syntax (works for Python, not Cython) + # W504: line break after binary operator (breaks lines that end with a pointer) + *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 244fc0d3872..e252af717ce 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - args: ["--config=setup.cfg"] + args: ["--config=.flake8"] files: python/.*$ types: [file] types_or: [python, cython] @@ -48,7 +48,7 @@ repos: hooks: - id: mypy additional_dependencies: [types-cachetools] - args: ["--config-file=setup.cfg", + args: ["--config-file=pyproject.toml", "python/cudf/cudf", "python/custreamz/custreamz", "python/cudf_kafka/cudf_kafka", @@ -58,7 +58,9 @@ repos: rev: 6.1.1 hooks: - id: pydocstyle - args: ["--config=setup.cfg"] + # https://github.com/PyCQA/pydocstyle/issues/603 + additional_dependencies: [toml] + args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: @@ -138,9 +140,11 @@ repos: pass_filenames: false verbose: false - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.2.2 hooks: - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] exclude: | (?x)^( .*test.*| diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index c8875fda641..831b91bb2a6 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -94,7 +94,7 @@ sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/setup sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/setup.py # Dependency versions in pyproject.toml -sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml +sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index dee7e2b8586..2829d14070c 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -501,7 +501,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); thrust::exclusive_scan( thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); - // offfsets are ready. + // offsets are ready. auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index f8ea194f0c4..e65aa69763b 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -373,13 +373,13 @@ class data_profile { void set_bool_probability_true(double p) { - CUDF_EXPECTS(p >= 0. and p <= 1., "probablity must be in range [0...1]"); + CUDF_EXPECTS(p >= 0. and p <= 1., "probability must be in range [0...1]"); bool_probability_true = p; } void set_null_probability(std::optional p) { CUDF_EXPECTS(p.value_or(0.) >= 0. and p.value_or(0.) <= 1., - "probablity must be in range [0...1]"); + "probability must be in range [0...1]"); null_probability = p; } void set_cardinality(cudf::size_type c) { cardinality = c; } diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 34071f44914..bb3479cf4c1 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -22,16 +22,16 @@ Specifically, cuDF uses the following tools: In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. - [`pydocstyle`](https://github.com/PyCQA/pydocstyle/) lints docstring style. +- [`codespell`](https://github.com/codespell-project/codespell) finds spelling errors. Linter config data is stored in a number of files. -We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `setup.cfg` > `python/cudf/setup.cfg`). +We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `pyproject.toml` > `python/cudf/pyproject.toml`). However, differences between tools and the different packages in the repo result in the following caveats: -- `flake8` has no plans to support `pyproject.toml`, so it must live in `setup.cfg`. +- `flake8` has no plans to support `pyproject.toml`, so it must live in `.flake8`. - `isort` must be configured per project to set which project is the "first party" project. -Additionally, our use of `versioneer` means that each project must have a `setup.cfg`. -As a result, we currently maintain both root and project-level `pyproject.toml` and `setup.cfg` files. +As a result, we currently maintain both root and project-level `pyproject.toml` files as well as a `.flake8` file. For more information on how to use pre-commit hooks, see the code formatting section of the [overall contributing guide](https://github.com/rapidsai/cudf/blob/main/CONTRIBUTING.md#python--pre-commit-hooks). diff --git a/pyproject.toml b/pyproject.toml index dfd22f33785..3940d9119ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,3 +17,41 @@ force-exclude = ''' dist )/ ''' + +[tool.pydocstyle] +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. 
+match-dir = "^(?!(ci|cpp|conda|docs|java|notebooks)).*$" +# Allow missing docstrings for docutils +ignore-decorators = ".*(docutils|doc_apply|copy_docstring).*" +select = "D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418" + # Would like to enable the following rules in the future: + # D200, D202, D205, D400 + +[tool.mypy] +ignore_missing_imports = true +# If we don't specify this, then mypy will check excluded files if +# they are imported by a checked file. +follow_imports = "skip" +exclude = [ + "cudf/_lib/", + "cudf/cudf/benchmarks/", + "cudf/cudf/tests/", + "cudf/cudf/utils/metadata/orc_column_statistics_pb2.py", + "custreamz/custreamz/tests/", + "dask_cudf/dask_cudf/tests/", + ] + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +ignore-words-list = "inout,unparseable,falsy" +builtin = "clear" +quiet-level = 3 diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 5f4d3e17fbc..56918799cca 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numpy as np import pyarrow as pa @@ -315,7 +315,7 @@ cdef columns_from_table_view( object owners, ): """ - Given a ``cudf::table_view``, construsts a list of columns from it, + Given a ``cudf::table_view``, constructs a list of columns from it, along with referencing an owner Python object that owns the memory lifetime. owner must be either None or a list of column. If owner is a list of columns, the owner of the `i`th ``cudf::column_view`` diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 49c4d83245f..305e8822030 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -15,3 +15,46 @@ requires = [ "protoc-wheel", "rmm==23.4.*", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", + "dask_cuda", +] +known_rapids = [ + "rmm", +] +known_first_party = [ + "cudf", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg deleted file mode 100644 index 8380da371f9..00000000000 --- a/python/cudf/setup.cfg +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
- -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - rmm -known_first_party= - cudf -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 0924fc90352..308a7869bc0 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -7,3 +7,49 @@ requires = [ "setuptools", "cython>=0.29,<0.30", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", + "dask_cuda", + "streamz", +] +known_rapids = [ + "rmm", + "cudf", + "dask_cudf", +] +known_first_party = [ + "cudf_kafka", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] diff --git a/python/cudf_kafka/setup.cfg b/python/cudf_kafka/setup.cfg deleted file mode 100644 index ee0d783b184..00000000000 --- a/python/cudf_kafka/setup.cfg +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda - streamz -known_rapids= - rmm - cudf - dask_cudf -known_first_party= - cudf_kafka -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 806848c356e..d5c41945482 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -6,3 +6,48 @@ requires = [ "wheel", "setuptools", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", + "dask_cuda", +] +known_rapids = [ + "rmm", + "cudf", + "dask_cudf", +] +known_first_party = [ + "streamz", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] diff --git a/python/custreamz/setup.cfg b/python/custreamz/setup.cfg deleted file mode 100644 index 8c038db9349..00000000000 --- a/python/custreamz/setup.cfg +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
- -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - rmm - cudf - dask_cudf -known_first_party= - streamz -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 806848c356e..8cf823d4291 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -6,3 +6,48 @@ requires = [ "wheel", "setuptools", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true + +known_dask = [ + "dask", + "distributed", + "dask_cuda", +] +known_rapids = [ + "rmm", + "cudf", +] +known_first_party = [ + "dask_cudf", +] + +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", +] diff --git a/python/dask_cudf/setup.cfg b/python/dask_cudf/setup.cfg index 66f4b8891d0..8139b3c7dc6 100644 --- a/python/dask_cudf/setup.cfg +++ b/python/dask_cudf/setup.cfg @@ -1,36 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - rmm - cudf -known_first_party= - dask_cudf -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - [options.entry_points] dask.dataframe.backends = cudf = dask_cudf.backends:CudfBackendEntrypoint diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 962b7d73bbe..00000000000 --- a/setup.cfg +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2017-2023, NVIDIA CORPORATION. - -[flake8] -filename = *.py, *.pyx, *.pxd, *.pxi -exclude = __init__.py, *.egg, build, docs, .git -force-check = True -ignore = - # line break before binary operator - W503, - # whitespace before : - E203 -per-file-ignores = - # Rules ignored only in Cython: - # E211: whitespace before '(' (used in multi-line imports) - # E225: Missing whitespace around operators (breaks cython casting syntax like ) - # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) - # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) - # E275: Missing whitespace after keyword (Doesn't work with Cython except?) - # E402: invalid syntax (works for Python, not Cython) - # E999: invalid syntax (works for Python, not Cython) - # W504: line break after binary operator (breaks lines that end with a pointer) - *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 - *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 - *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 - -[pydocstyle] -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather -# than include using match-dir. 
Note that as discussed in -# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, -# unlike the match option above this match-dir will have no effect when -# pydocstyle is invoked from pre-commit. Therefore this exclusion list must -# also be maintained in the pre-commit config file. -match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$ -# Allow missing docstrings for docutils -ignore-decorators = .*(docutils|doc_apply|copy_docstring).* -select = - D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418 - # Would like to enable the following rules in the future: - # D200, D202, D205, D400 - -[mypy] -ignore_missing_imports = True -# If we don't specify this, then mypy will check excluded files if -# they are imported by a checked file. -follow_imports = skip -exclude = (?x)( - cudf/_lib/ - | cudf/cudf/benchmarks/ - | cudf/cudf/tests/ - | cudf/cudf/utils/metadata/orc_column_statistics_pb2.py - | custreamz/custreamz/tests/ - | dask_cudf/dask_cudf/tests/ - # This close paren cannot be in column zero otherwise the config parser barfs - ) - -[codespell] -# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - -# this is only to allow you to run codespell interactively -skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp -# ignore short words, and typename parameters like OffsetT -ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b -ignore-words-list = inout,unparseable -builtin = clear -quiet-level = 3 From 4f2f37987fbd66de0cc9116734d2094ca4a39948 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 24 Feb 2023 17:04:59 -0600 Subject: [PATCH 08/10] Enable nbqa pre-commit hooks for isort and black. (#12848) This enables `black` and `isort` linters for ipynb notebooks via [nbqa](https://github.com/nbQA-dev/nbQA). I propose this change to avoid manually linting notebooks like https://github.com/rapidsai/cudf/pull/12595. cc: @galipremsagar Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12848 --- .pre-commit-config.yaml | 10 ++ docs/cudf/source/user_guide/10min.ipynb | 1 + .../cudf/source/user_guide/cupy-interop.ipynb | 34 ++-- .../source/user_guide/guide-to-udfs.ipynb | 149 +++++++++--------- .../cudf/source/user_guide/missing-data.ipynb | 56 ++++--- 5 files changed, 141 insertions(+), 109 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e252af717ce..a030f3bd25b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,6 +61,16 @@ repos: # https://github.com/PyCQA/pydocstyle/issues/603 additional_dependencies: [toml] args: ["--config=pyproject.toml"] + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.6.3 + hooks: + - id: nbqa-isort + # Use the cudf_kafka isort orderings in notebooks so that dask + # and RAPIDS packages have their own sections. + args: ["--settings-file=python/cudf_kafka/pyproject.toml"] + - id: nbqa-black + # Explicitly specify the pyproject.toml at the repo root, not per-project. 
+ args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index af938b79a29..0352c624e04 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -35,6 +35,7 @@ "\n", "import cupy as cp\n", "import pandas as pd\n", + "\n", "import cudf\n", "import dask_cudf\n", "\n", diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 3e169984ace..c98a4ddea23 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -18,9 +18,10 @@ "outputs": [], "source": [ "import timeit\n", - "from packaging import version\n", "\n", "import cupy as cp\n", + "from packaging import version\n", + "\n", "import cudf\n", "\n", "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n", @@ -63,10 +64,13 @@ ], "source": [ "nelem = 10000\n", - "df = cudf.DataFrame({'a':range(nelem),\n", - " 'b':range(500, nelem + 500),\n", - " 'c':range(1000, nelem + 1000)}\n", - " )\n", + "df = cudf.DataFrame(\n", + " {\n", + " \"a\": range(nelem),\n", + " \"b\": range(500, nelem + 500),\n", + " \"c\": range(1000, nelem + 1000),\n", + " }\n", + ")\n", "\n", "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", "%timeit arr_cupy = df.values\n", @@ -138,7 +142,7 @@ } ], "source": [ - "col = 'a'\n", + "col = \"a\"\n", "\n", "%timeit cola_cupy = cp.asarray(df[col])\n", "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n", @@ -1088,14 +1092,16 @@ "metadata": {}, "outputs": [], "source": [ - "def cudf_to_cupy_sparse_matrix(data, sparseformat='column'):\n", - " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\n", - " \"\"\"\n", - " if sparseformat not in ('row', 'column',):\n", + "def cudf_to_cupy_sparse_matrix(data, sparseformat=\"column\"):\n", + " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\"\"\"\n", + " if sparseformat not in (\n", + " \"row\",\n", + " \"column\",\n", + " ):\n", " raise ValueError(\"Let's focus on column and row formats for now.\")\n", - " \n", + "\n", " _sparse_constructor = cp.sparse.csc_matrix\n", - " if sparseformat == 'row':\n", + " if sparseformat == \"row\":\n", " _sparse_constructor = cp.sparse.csr_matrix\n", "\n", " return _sparse_constructor(cupy_from_dlpack(data.to_dlpack()))" @@ -1121,8 +1127,8 @@ "nonzero = 1000\n", "for i in range(20):\n", " arr = cp.random.normal(5, 5, nelem)\n", - " arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0\n", - " df['a' + str(i)] = arr" + " arr[cp.random.choice(arr.shape[0], nelem - nonzero, replace=False)] = 0\n", + " df[\"a\" + str(i)] = arr" ] }, { diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 943fc980a31..ba8c65784d2 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -15,9 +15,10 @@ "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", + "\n", "import cudf\n", - "from cudf.datasets import randomdata\n", - "import numpy as np" + "from cudf.datasets import randomdata" ] }, { @@ -375,7 +376,7 @@ "metadata": {}, "outputs": [], "source": [ - "sr = cudf.Series(['', 'abc', 'some_example'])" + "sr = cudf.Series([\"\", \"abc\", \"some_example\"])" ] }, { @@ -387,9 +388,9 @@ "source": [ "def f(st):\n", " if len(st) > 0:\n", - " if st.startswith('a'):\n", + 
" if st.startswith(\"a\"):\n", " return 1\n", - " elif 'example' in st:\n", + " elif \"example\" in st:\n", " return 2\n", " else:\n", " return -1\n", @@ -443,6 +444,7 @@ "outputs": [], "source": [ "from cudf.core.udf.utils import set_malloc_heap_size\n", + "\n", "set_malloc_heap_size(int(2e9))" ] }, @@ -472,7 +474,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)" + "df = randomdata(nrows=5, dtypes={\"a\": int, \"b\": int, \"c\": int}, seed=12)" ] }, { @@ -484,10 +486,11 @@ "source": [ "from numba import cuda\n", "\n", + "\n", "@cuda.jit\n", "def multiply(in_col, out_col, multiplier):\n", " i = cuda.grid(1)\n", - " if i < in_col.size: # boundary guard\n", + " if i < in_col.size: # boundary guard\n", " out_col[i] = in_col[i] * multiplier" ] }, @@ -508,9 +511,9 @@ "metadata": {}, "outputs": [], "source": [ - "size = len(df['a'])\n", - "df['e'] = 0.0\n", - "multiply.forall(size)(df['a'], df['e'], 10.0)" + "size = len(df[\"a\"])\n", + "df[\"e\"] = 0.0\n", + "multiply.forall(size)(df[\"a\"], df[\"e\"], 10.0)" ] }, { @@ -658,7 +661,7 @@ "outputs": [], "source": [ "def f(row):\n", - " return row['A'] + row['B']" + " return row[\"A\"] + row[\"B\"]" ] }, { @@ -733,10 +736,7 @@ } ], "source": [ - "df = cudf.DataFrame({\n", - " 'A': [1,2,3],\n", - " 'B': [4,cudf.NA,6]\n", - "})\n", + "df = cudf.DataFrame({\"A\": [1, 2, 3], \"B\": [4, cudf.NA, 6]})\n", "df" ] }, @@ -881,13 +881,14 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", + " x = row[\"a\"]\n", " if x is cudf.NA:\n", " return 0\n", " else:\n", " return x + 1\n", "\n", - "df = cudf.DataFrame({'a': [1, cudf.NA, 3]})\n", + "\n", + "df = cudf.DataFrame({\"a\": [1, cudf.NA, 3]})\n", "df" ] }, @@ -988,17 +989,15 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", - " y = row['b']\n", + " x = row[\"a\"]\n", + " y = row[\"b\"]\n", " if x + y > 3:\n", " return cudf.NA\n", " else:\n", " return x + y\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [2, 1, 1]\n", - "})\n", + "\n", + "df = cudf.DataFrame({\"a\": [1, 2, 3], \"b\": [2, 1, 1]})\n", "df" ] }, @@ -1099,12 +1098,10 @@ ], "source": [ "def f(row):\n", - " return row['a'] + row['b']\n", + " return row[\"a\"] + row[\"b\"]\n", + "\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [0.5, cudf.NA, 3.14]\n", - "})\n", + "df = cudf.DataFrame({\"a\": [1, 2, 3], \"b\": [0.5, cudf.NA, 3.14]})\n", "df" ] }, @@ -1214,15 +1211,14 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", + " x = row[\"a\"]\n", " if x > 3:\n", - " return x\n", + " return x\n", " else:\n", - " return 1.5\n", + " return 1.5\n", + "\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 3, 5]\n", - "})\n", + "df = cudf.DataFrame({\"a\": [1, 3, 5]})\n", "df" ] }, @@ -1335,15 +1331,18 @@ ], "source": [ "def f(row):\n", - " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", + " return row[\"a\"] + (row[\"b\"] - (row[\"c\"] / row[\"d\"])) % row[\"e\"]\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3],\n", - " 'b': [4, 5, 6],\n", - " 'c': [cudf.NA, 4, 4],\n", - " 'd': [8, 7, 8],\n", - " 'e': [7, 1, 6]\n", - "})\n", + "\n", + "df = cudf.DataFrame(\n", + " {\n", + " \"a\": [1, 2, 3],\n", + " \"b\": [4, 5, 6],\n", + " \"c\": [cudf.NA, 4, 4],\n", + " \"d\": [8, 7, 8],\n", + " \"e\": [7, 1, 6],\n", + " }\n", + ")\n", "df" ] }, @@ -1451,10 +1450,9 @@ } ], "source": [ - "str_df = cudf.DataFrame({\n", - " 'str_col': ['abc', 'ABC', 'Example'],\n", - " 'scale': [1, 2, 3]\n", - "})\n", + "str_df = 
cudf.DataFrame(\n", + " {\"str_col\": [\"abc\", \"ABC\", \"Example\"], \"scale\": [1, 2, 3]}\n", + ")\n", "str_df" ] }, @@ -1466,9 +1464,9 @@ "outputs": [], "source": [ "def f(row):\n", - " st = row['str_col']\n", - " scale = row['scale']\n", - " \n", + " st = row[\"str_col\"]\n", + " scale = row[\"scale\"]\n", + "\n", " if len(st) > 5:\n", " return len(st) + scale\n", " else:\n", @@ -1626,11 +1624,12 @@ } ], "source": [ - "df = df.apply_rows(conditional_add, \n", - " incols={'a':'x', 'e':'y'},\n", - " outcols={'out': np.float64},\n", - " kwargs={}\n", - " )\n", + "df = df.apply_rows(\n", + " conditional_add,\n", + " incols={\"a\": \"x\", \"e\": \"y\"},\n", + " outcols={\"out\": np.float64},\n", + " kwargs={},\n", + ")\n", "df.head()" ] }, @@ -1738,10 +1737,11 @@ " for i, (x, y) in enumerate(zip(a, b)):\n", " out[i] = x + y\n", "\n", - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)\n", - "df.loc[2, 'a'] = None\n", - "df.loc[3, 'b'] = None\n", - "df.loc[1, 'c'] = None\n", + "\n", + "df = randomdata(nrows=5, dtypes={\"a\": int, \"b\": int, \"c\": int}, seed=12)\n", + "df.loc[2, \"a\"] = None\n", + "df.loc[3, \"b\"] = None\n", + "df.loc[1, \"c\"] = None\n", "df.head()" ] }, @@ -1841,10 +1841,9 @@ } ], "source": [ - "df = df.apply_rows(gpu_add, \n", - " incols=['a', 'b'],\n", - " outcols={'out':np.float64},\n", - " kwargs={})\n", + "df = df.apply_rows(\n", + " gpu_add, incols=[\"a\", \"b\"], outcols={\"out\": np.float64}, kwargs={}\n", + ")\n", "df.head()" ] }, @@ -1892,7 +1891,7 @@ } ], "source": [ - "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')\n", + "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype=\"float64\")\n", "ser" ] }, @@ -1935,12 +1934,13 @@ "source": [ "import math\n", "\n", + "\n", "def example_func(window):\n", " b = 0\n", " for a in window:\n", " b = max(b, math.sqrt(a))\n", " if b == 8:\n", - " return 100 \n", + " return 100\n", " return b" ] }, @@ -2064,8 +2064,8 @@ ], "source": [ "df2 = cudf.DataFrame()\n", - "df2['a'] = np.arange(55, 65, dtype='float64')\n", - "df2['b'] = np.arange(55, 65, dtype='float64')\n", + "df2[\"a\"] = np.arange(55, 65, dtype=\"float64\")\n", + "df2[\"b\"] = np.arange(55, 65, dtype=\"float64\")\n", "df2.head()" ] }, @@ -2279,7 +2279,9 @@ } ], "source": [ - "df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12)\n", + "df = randomdata(\n", + " nrows=10, dtypes={\"a\": float, \"b\": bool, \"c\": str, \"e\": float}, seed=12\n", + ")\n", "df.head()" ] }, @@ -2290,7 +2292,7 @@ "metadata": {}, "outputs": [], "source": [ - "grouped = df.groupby(['b'])" + "grouped = df.groupby([\"b\"])" ] }, { @@ -2469,9 +2471,9 @@ } ], "source": [ - "results = grouped.apply_grouped(rolling_avg,\n", - " incols=['e'],\n", - " outcols=dict(rolling_avg_e=np.float64))\n", + "results = grouped.apply_grouped(\n", + " rolling_avg, incols=[\"e\"], outcols=dict(rolling_avg_e=np.float64)\n", + ")\n", "results" ] }, @@ -2554,8 +2556,9 @@ " i = cuda.grid(1)\n", " if i < x.size:\n", " out[i] = x[i] * 5\n", - " \n", - "out = cudf.Series(cp.zeros(len(s), dtype='int32'))\n", + "\n", + "\n", + "out = cudf.Series(cp.zeros(len(s), dtype=\"int32\"))\n", "multiply_by_5.forall(s.shape[0])(s, out)\n", "out" ] diff --git a/docs/cudf/source/user_guide/missing-data.ipynb b/docs/cudf/source/user_guide/missing-data.ipynb index ac5bddd34cf..f1404ce0b77 100644 --- a/docs/cudf/source/user_guide/missing-data.ipynb +++ b/docs/cudf/source/user_guide/missing-data.ipynb @@ -39,8 +39,9 @@ "metadata": {}, "outputs": [], "source": [ - "import 
cudf\n", - "import numpy as np" + "import numpy as np\n", + "\n", + "import cudf" ] }, { @@ -50,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]})" + "df = cudf.DataFrame({\"a\": [1, 2, None, 4], \"b\": [0.1, None, 2.3, 17.17]})" ] }, { @@ -221,7 +222,7 @@ } ], "source": [ - "df['a'].notna()" + "df[\"a\"].notna()" ] }, { @@ -304,7 +305,7 @@ } ], "source": [ - "df['b'] == np.nan" + "df[\"b\"] == np.nan" ] }, { @@ -535,7 +536,10 @@ ], "source": [ "import pandas as pd\n", - "datetime_series = cudf.Series([pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")])\n", + "\n", + "datetime_series = cudf.Series(\n", + " [pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")]\n", + ")\n", "datetime_series" ] }, @@ -618,7 +622,12 @@ "metadata": {}, "outputs": [], "source": [ - "df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)})" + "df1 = cudf.DataFrame(\n", + " {\n", + " \"a\": [1, None, 2, 3, None],\n", + " \"b\": cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False),\n", + " }\n", + ")" ] }, { @@ -628,7 +637,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])})" + "df2 = cudf.DataFrame(\n", + " {\"a\": [1, 11, 2, 34, 10], \"b\": cudf.Series([0.23, 22, 3.2, None, 1])}\n", + ")" ] }, { @@ -899,7 +910,7 @@ } ], "source": [ - "df1['a']" + "df1[\"a\"]" ] }, { @@ -920,7 +931,7 @@ } ], "source": [ - "df1['a'].sum()" + "df1[\"a\"].sum()" ] }, { @@ -949,7 +960,7 @@ } ], "source": [ - "df1['a'].mean()" + "df1[\"a\"].mean()" ] }, { @@ -980,7 +991,7 @@ } ], "source": [ - "df1['a'].sum(skipna=False)" + "df1[\"a\"].sum(skipna=False)" ] }, { @@ -1001,7 +1012,7 @@ } ], "source": [ - "df1['a'].mean(skipna=False)" + "df1[\"a\"].mean(skipna=False)" ] }, { @@ -1035,7 +1046,7 @@ } ], "source": [ - "df1['a'].cumsum()" + "df1[\"a\"].cumsum()" ] }, { @@ -1069,7 +1080,7 @@ } ], "source": [ - "df1['a'].cumsum(skipna=False)" + "df1[\"a\"].cumsum(skipna=False)" ] }, { @@ -1148,7 +1159,7 @@ } ], "source": [ - "cudf.Series([], dtype='float64').sum()" + "cudf.Series([], dtype=\"float64\").sum()" ] }, { @@ -1219,7 +1230,7 @@ } ], "source": [ - "cudf.Series([], dtype='float64').prod()" + "cudf.Series([], dtype=\"float64\").prod()" ] }, { @@ -1382,7 +1393,7 @@ } ], "source": [ - "df1.groupby('a').mean()" + "df1.groupby(\"a\").mean()" ] }, { @@ -1463,7 +1474,7 @@ } ], "source": [ - "df1.groupby('a', dropna=False).mean()" + "df1.groupby(\"a\", dropna=False).mean()" ] }, { @@ -1670,7 +1681,7 @@ } ], "source": [ - "df1['b'].fillna(10)" + "df1[\"b\"].fillna(10)" ] }, { @@ -1697,7 +1708,8 @@ "outputs": [], "source": [ "import cupy as cp\n", - "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC'))" + "\n", + "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list(\"ABC\"))" ] }, { @@ -2339,7 +2351,7 @@ } ], "source": [ - "df1['a'].dropna()" + "df1[\"a\"].dropna()" ] }, { From d14d980b63402a779a3f75cc64cb3a5a0be7898d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 24 Feb 2023 15:30:07 -0800 Subject: [PATCH 09/10] Add dfg as a pre-commit hook (#12819) This change allows local and remote runs to handle calls to dfg identically, and removes the need for a separate CI check. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12819 --- .github/workflows/pr.yaml | 2 ++ .pre-commit-config.yaml | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 952b58abda5..3a80139e333 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,8 @@ jobs: checks: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04 + with: + enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a030f3bd25b..1eb2c508db9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -160,6 +160,11 @@ repos: .*test.*| ^CHANGELOG.md$ ) + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.4.0 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 From eb4da9345f172c3911f78c5e851757ec2ec222b9 Mon Sep 17 00:00:00 2001 From: Carl Simon Adorf Date: Sat, 25 Feb 2023 01:13:34 +0100 Subject: [PATCH 10/10] CI: Remove specification of manual stage for check_style.sh script. (#12803) Do not explicitly specify to run the "manual" stage when running pre-commits as part of the ci/check_style.sh script. Authors: - Carl Simon Adorf (https://github.com/csadorf) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12803 --- ci/check_style.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/check_style.sh b/ci/check_style.sh index 020143095ce..f9bfea7b47c 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. set -euo pipefail @@ -20,4 +20,4 @@ mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} # Run pre-commit checks -pre-commit run --hook-stage manual --all-files --show-diff-on-failure +pre-commit run --all-files --show-diff-on-failure
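
A note on what dropping `--hook-stage manual` changes: pre-commit only runs hooks whose configured stages match the invocation, so after this change ci/check_style.sh exercises exactly the same default-stage hooks as a plain local `pre-commit run`. A minimal sketch of the mechanism, using a deliberately hypothetical hook id (`some-expensive-check` is illustrative only, not a hook in this repo):

    # .pre-commit-config.yaml (hypothetical entry, for illustration only)
    - id: some-expensive-check
      stages: [manual]   # opted out of the default stage

With such an entry in place, the default invocation skips the hook:

    # Matches what ci/check_style.sh now runs; manual-stage hooks are skipped
    pre-commit run --all-files --show-diff-on-failure

    # The hook only executes when the manual stage is requested explicitly
    pre-commit run --hook-stage manual --all-files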