Replace black with ruff-format #15312

Merged · 10 commits · Mar 15, 2024

Changes from 2 commits
9 changes: 2 additions & 7 deletions .pre-commit-config.yaml
@@ -23,13 +23,6 @@ repos:
args: ["--config-root=python/", "--resolve-all-configs"]
files: python/.*
types_or: [python, cython, pyi]
- repo: https://github.com/psf/black
rev: 23.12.1
hooks:
- id: black
files: python/.*
# Explicitly specify the pyproject.toml at the repo root, not per-project.
args: ["--config", "pyproject.toml"]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.16.0
hooks:
@@ -155,6 +148,8 @@ repos:
hooks:
- id: ruff
files: python/.*$
- id: ruff-format
files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.0.1
hooks:
20 changes: 1 addition & 19 deletions pyproject.toml
@@ -1,22 +1,4 @@
[tool.black]
line-length = 79
target-version = ["py39"]
include = '\.py?$'
force-exclude = '''
/(
thirdparty |
\.eggs |
\.git |
\.hg |
\.mypy_cache |
\.tox |
\.venv |
_build |
buck-out |
build |
dist
)/
'''
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

[tool.pydocstyle]
# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
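For reference, below is a minimal sketch of ruff settings in pyproject.toml that could stand in for the removed [tool.black] block. The exact configuration adopted by this PR is not shown in this excerpt, so the values here (line length 79, Python 3.9 target, an explicit exclude) are assumptions carried over from the deleted black settings.

```toml
# Hypothetical [tool.ruff] replacement for the removed [tool.black] section;
# the settings actually merged in this PR may differ.
[tool.ruff]
line-length = 79
target-version = "py39"
# ruff respects .gitignore by default, so black's long force-exclude regex
# is generally unnecessary; extra paths can still be excluded explicitly.
extend-exclude = ["thirdparty"]
```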
21 changes: 5 additions & 16 deletions python/cudf/benchmarks/API/bench_dataframe.py
@@ -30,9 +30,7 @@ def bench_eval_func(benchmark, expr, dataframe):
[2, 3, 4],
)
def bench_merge(benchmark, dataframe, num_key_cols):
benchmark(
dataframe.merge, dataframe, on=list(dataframe.columns[:num_key_cols])
)
benchmark(dataframe.merge, dataframe, on=list(dataframe.columns[:num_key_cols]))


# TODO: Some of these cases could be generalized to an IndexedFrame benchmark
@@ -67,9 +65,7 @@ def random_state(request):
def bench_sample(benchmark, dataframe, axis, frac, random_state):
if axis == 1 and isinstance(random_state, cupy.random.RandomState):
pytest.skip("Unsupported params.")
benchmark(
dataframe.sample, frac=frac, axis=axis, random_state=random_state
)
benchmark(dataframe.sample, frac=frac, axis=axis, random_state=random_state)


@benchmark_with_object(cls="dataframe", dtype="int")
@@ -121,10 +117,7 @@ def bench_groupby(benchmark, dataframe, num_key_cols):
[
"sum",
["sum", "mean"],
{
f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"]
for i in range(6)
},
{f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"] for i in range(6)},
Contributor: I find the previous version more readable. Do we need to configure any line-length settings to achieve that?

Contributor Author: I can reconfigure ruff to go back to a 79-character line length.

Contributor Author: Of note: some of our copyright lines are longer than 79 characters, so I had to noqa those.

Contributor: Thanks! I think noqa'ing those is fine.

Contributor: #15312 (comment)

Adding noqa here isn't ideal. The SPDX identifiers are meant to be machine-readable...

Contributor: This is a documented limitation: astral-sh/ruff#4429, astral-sh/ruff#5899 ☹️

Contributor: The reason I hesitate on this is that I think we are supposed to migrate towards SPDX identifiers in all our copyright headers. We shouldn't noqa every file.

Contributor Author: FWIW, I would also be OK with a line length of 88 (that's what we use in pandas).

Contributor: We can, as the ruff FAQ suggests, run ruff format but ignore E501 (line-length issues). This might leave some lines over the limit, but with the conservative value of 79, I don't think that is particularly problematic.

Contributor Author: Good idea with ignoring E501. Yeah, we can go with that for now. (A sketch of that configuration is shown after this diff.)

],
)
@pytest.mark.parametrize(
@@ -154,9 +147,7 @@ def bench_groupby_sample(
kwargs = {"frac": target_sample_frac, "replace": replace}
else:
minsize = grouper.size().min()
target_size = numpy.round(
target_sample_frac * minsize, decimals=0
).astype(int)
target_size = numpy.round(target_sample_frac * minsize, decimals=0).astype(int)
kwargs = {"n": target_size, "replace": replace}

benchmark(grouper.sample, **kwargs)
@@ -165,9 +156,7 @@ def bench_groupby_sample(
@benchmark_with_object(cls="dataframe", dtype="int")
@pytest.mark.parametrize("num_cols_to_sort", [1])
def bench_sort_values(benchmark, dataframe, num_cols_to_sort):
benchmark(
dataframe.sort_values, list(dataframe.columns[:num_cols_to_sort])
)
benchmark(dataframe.sort_values, list(dataframe.columns[:num_cols_to_sort]))


@benchmark_with_object(cls="dataframe", dtype="int")
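Following up on the review thread above, here is a minimal sketch of the pyproject.toml settings the reviewers converge on: keep the formatter wrapping at 79 columns but stop the linter from flagging E501. This is only an illustration of that suggestion, not necessarily the configuration merged in this PR.

```toml
# Sketch only: ruff-format wraps at 79 columns, while E501 (line-too-long)
# is ignored so occasional long lines, such as copyright headers, are not
# flagged by the linter.
[tool.ruff]
line-length = 79

[tool.ruff.lint]
ignore = ["E501"]
```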
12 changes: 3 additions & 9 deletions python/cudf/benchmarks/API/bench_functions.py
@@ -9,9 +9,7 @@
from utils import benchmark_with_object


@pytest_cases.parametrize_with_cases(
"objs", prefix="concat", cases="cases_functions"
)
@pytest_cases.parametrize_with_cases("objs", prefix="concat", cases="cases_functions")
@pytest.mark.parametrize(
"axis",
[
@@ -21,9 +19,7 @@
@pytest.mark.parametrize("join", ["inner", "outer"])
@pytest.mark.parametrize("ignore_index", [True, False])
def bench_concat_axis_1(benchmark, objs, axis, join, ignore_index):
benchmark(
cudf.concat, objs=objs, axis=axis, join=join, ignore_index=ignore_index
)
benchmark(cudf.concat, objs=objs, axis=axis, join=join, ignore_index=ignore_index)


@pytest.mark.parametrize("size", [10_000, 100_000])
@@ -51,9 +47,7 @@ def bench_get_dummies_simple(benchmark, prefix):
"col3": cudf.Series(list(range(100, 110)), dtype="category"),
}
)
benchmark(
cudf.get_dummies, df, columns=["col1", "col2", "col3"], prefix=prefix
)
benchmark(cudf.get_dummies, df, columns=["col1", "col2", "col3"], prefix=prefix)


@benchmark_with_object(cls="dataframe", dtype="int", cols=6)
6 changes: 2 additions & 4 deletions python/cudf/benchmarks/API/bench_multiindex.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

"""Benchmarks of MultiIndex methods."""

@@ -31,9 +31,7 @@ def bench_from_pandas(benchmark, pidx):


def bench_constructor(benchmark, midx):
benchmark(
cudf.MultiIndex, codes=midx.codes, levels=midx.levels, names=midx.names
)
benchmark(cudf.MultiIndex, codes=midx.codes, levels=midx.levels, names=midx.names)


def bench_from_frame(benchmark, midx):
20 changes: 5 additions & 15 deletions python/cudf/benchmarks/API/cases_functions.py
@@ -28,9 +28,7 @@ def concat_case_contiguous_indexes(nr):
@pytest_cases.parametrize("nr", NUM_ROWS)
def concat_case_contiguous_indexes_different_cols(nr):
return [
cudf.DataFrame(
{"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}
),
cudf.DataFrame({"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}),
cudf.DataFrame(
{"c": cupy.tile([4, 5, 7], nr)},
index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3),
@@ -117,30 +115,22 @@ def concat_case_unique_columns(nr):
@pytest_cases.parametrize("nr", NUM_ROWS)
def concat_case_unique_columns_with_different_range_index(nr):
return [
cudf.DataFrame(
{"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}
),
cudf.DataFrame({"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}),
cudf.DataFrame(
{"c": cupy.tile([4, 5, 7], nr)},
index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3),
),
cudf.DataFrame(
{"d": cupy.tile([1, 2, 3], nr), "e": cupy.tile([4, 5, 7], nr)}
),
cudf.DataFrame({"d": cupy.tile([1, 2, 3], nr), "e": cupy.tile([4, 5, 7], nr)}),
cudf.DataFrame(
{"f": cupy.tile([4, 5, 7], nr)},
index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3),
),
cudf.DataFrame(
{"g": cupy.tile([1, 2, 3], nr), "h": cupy.tile([4, 5, 7], nr)}
),
cudf.DataFrame({"g": cupy.tile([1, 2, 3], nr), "h": cupy.tile([4, 5, 7], nr)}),
cudf.DataFrame(
{"i": cupy.tile([4, 5, 7], nr)},
index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3),
),
cudf.DataFrame(
{"j": cupy.tile([1, 2, 3], nr), "k": cupy.tile([4, 5, 7], nr)}
),
cudf.DataFrame({"j": cupy.tile([1, 2, 3], nr), "k": cupy.tile([4, 5, 7], nr)}),
cudf.DataFrame(
{"l": cupy.tile([4, 5, 7], nr)},
index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3),
6 changes: 2 additions & 4 deletions python/cudf/benchmarks/common/utils.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

"""Common utilities for fixture creation and benchmarking."""

@@ -42,9 +42,7 @@ def make_boolean_mask_column(size):
return cudf.core.column.as_column(rstate.randint(0, 2, size).astype(bool))


def benchmark_with_object(
cls, *, dtype="int", nulls=None, cols=None, rows=None
):
def benchmark_with_object(cls, *, dtype="int", nulls=None, cols=None, rows=None):
"""Pass "standard" cudf fixtures to functions without renaming parameters.

The fixture generation logic in conftest.py provides a plethora of useful
17 changes: 4 additions & 13 deletions python/cudf/benchmarks/conftest.py
@@ -93,10 +93,7 @@ def make_dataframe(nr, nc, column_generator=column_generator):
string.ascii_lowercase
), "make_dataframe only supports a maximum of 26 columns"
return cudf.DataFrame(
{
f"{string.ascii_lowercase[i]}": column_generator(nr)
for i in range(nc)
}
{f"{string.ascii_lowercase[i]}": column_generator(nr) for i in range(nc)}
)

for nr in NUM_ROWS:
@@ -108,9 +105,7 @@ def make_dataframe(nr, nc, column_generator=column_generator):
# https://github.com/smarie/python-pytest-cases/issues/278
# Once that is fixed we could remove all the extraneous `request`
# fixtures in these fixtures.
def series_nulls_false(
request, nr=nr, column_generator=column_generator
):
def series_nulls_false(request, nr=nr, column_generator=column_generator):
return cudf.Series(column_generator(nr))

make_fixture(
Expand All @@ -120,9 +115,7 @@ def series_nulls_false(
fixtures,
)

def series_nulls_true(
request, nr=nr, column_generator=column_generator
):
def series_nulls_true(request, nr=nr, column_generator=column_generator):
s = cudf.Series(column_generator(nr))
s.iloc[::2] = None
return s
@@ -135,9 +128,7 @@ def series_nulls_true(
)

# For now, not bothering to include a nullable index fixture.
def index_nulls_false(
request, nr=nr, column_generator=column_generator
):
def index_nulls_false(request, nr=nr, column_generator=column_generator):
return cudf.Index(column_generator(nr))

make_fixture(
8 changes: 2 additions & 6 deletions python/cudf/benchmarks/internal/bench_column.py
@@ -31,9 +31,7 @@ def bench_unique_single_column(benchmark, column):
@pytest.mark.parametrize("nullify", [True, False])
@pytest.mark.parametrize("gather_how", ["sequence", "reverse", "random"])
def bench_take(benchmark, column, gather_how, nullify):
gather_map = make_gather_map(
column.size * 0.4, column.size, gather_how
)._column
gather_map = make_gather_map(column.size * 0.4, column.size, gather_how)._column
benchmark(column.take, gather_map, nullify=nullify)


@@ -107,8 +105,6 @@ def setitem_case_int_column_align_to_col_size(column):
# column (len(val) != len(key) and len == num_true)


@pytest_cases.parametrize_with_cases(
"column,key,value", cases=".", prefix="setitem"
)
@pytest_cases.parametrize_with_cases("column,key,value", cases=".", prefix="setitem")
def bench_setitem(benchmark, column, key, value):
benchmark(column.__setitem__, key, value)
9 changes: 3 additions & 6 deletions python/cudf/cudf/_fuzz_testing/avro.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import io
@@ -69,17 +69,14 @@ def generate_input(self):
- cudf.utils.dtypes.TIMEDELTA_TYPES
)

dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list)
self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
logging.info(
f"Generating DataFrame with rows: {num_rows} "
f"and columns: {num_cols}"
f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}"
)
table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
df = pyarrow_to_pandas(table)
38 changes: 10 additions & 28 deletions python/cudf/cudf/_fuzz_testing/csv.py
@@ -53,16 +53,13 @@ def generate_input(self):
seed = random.randint(0, 2**32 - 1)
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_columns"] = num_cols
logging.info(
f"Generating DataFrame with rows: {num_rows} "
f"and columns: {num_cols}"
f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}"
)
table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
df = pyarrow_to_pandas(table)
@@ -84,18 +81,12 @@ def set_rand_params(self, params):
col_val = np.random.choice(
[
None,
np.unique(
np.random.choice(self._df.columns, col_size)
),
np.unique(np.random.choice(self._df.columns, col_size)),
]
)
params_dict[param] = (
col_val if col_val is None else list(col_val)
)
params_dict[param] = col_val if col_val is None else list(col_val)
elif param == "dtype":
dtype_val = np.random.choice(
[None, self._df.dtypes.to_dict()]
)
dtype_val = np.random.choice([None, self._df.dtypes.to_dict()])
if dtype_val is not None:
dtype_val = {
col_name: "category"
Expand All @@ -110,13 +101,9 @@ def set_rand_params(self, params):
)
params_dict[param] = header_val
elif param == "skiprows":
params_dict[param] = np.random.randint(
low=0, high=len(self._df)
)
params_dict[param] = np.random.randint(low=0, high=len(self._df))
elif param == "skipfooter":
params_dict[param] = np.random.randint(
low=0, high=len(self._df)
)
params_dict[param] = np.random.randint(low=0, high=len(self._df))
elif param == "nrows":
nrows_val = np.random.choice(
[None, np.random.randint(low=0, high=len(self._df))]
@@ -158,16 +145,13 @@ def generate_input(self):
seed = random.randint(0, 2**32 - 1)
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_columns"] = num_cols
logging.info(
f"Generating DataFrame with rows: {num_rows} "
f"and columns: {num_cols}"
f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}"
)
table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
df = pyarrow_to_pandas(table)
@@ -188,9 +172,7 @@ def set_rand_params(self, params):
col_size = self._rand(len(self._current_buffer.columns))
params_dict[param] = list(
np.unique(
np.random.choice(
self._current_buffer.columns, col_size
)
np.random.choice(self._current_buffer.columns, col_size)
)
)
elif param == "chunksize":