diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9235c80bdc9..67a71021a63 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,13 +23,6 @@ repos: args: ["--config-root=python/", "--resolve-all-configs"] files: python/.* types_or: [python, cython, pyi] - - repo: https://github.com/psf/black - rev: 23.12.1 - hooks: - - id: black - files: python/.* - # Explicitly specify the pyproject.toml at the repo root, not per-project. - args: ["--config", "pyproject.toml"] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.0 hooks: @@ -64,9 +57,6 @@ repos: # Use the cudf_kafka isort orderings in notebooks so that dask # and RAPIDS packages have their own sections. args: ["--settings-file=python/cudf_kafka/pyproject.toml"] - - id: nbqa-black - # Explicitly specify the pyproject.toml at the repo root, not per-project. - args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v16.0.6 hooks: @@ -155,6 +145,8 @@ repos: hooks: - id: ruff files: python/.*$ + - id: ruff-format + files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks rev: v0.0.1 hooks: diff --git a/pyproject.toml b/pyproject.toml index 4048eb9452c..c71394058df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,22 +1,4 @@ -[tool.black] -line-length = 79 -target-version = ["py39"] -include = '\.py?$' -force-exclude = ''' -/( - thirdparty | - \.eggs | - \.git | - \.hg | - \.mypy_cache | - \.tox | - \.venv | - _build | - buck-out | - build | - dist -)/ -''' +# Copyright (c) 2019-2024, NVIDIA CORPORATION. [tool.pydocstyle] # Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather @@ -60,13 +42,15 @@ select = ["E", "F", "W"] ignore = [ # whitespace before : "E203", + # line-too-long (due to Copyright header) + "E501", ] fixable = ["ALL"] exclude = [ # TODO: Remove this in a follow-up where we fix __all__. 
"__init__.py", ] -line-length = 88 +line-length = 79 [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 053425fff8d..4e2fad08d56 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -85,8 +85,9 @@ def _read_tzfile_as_frame(tzdir, zone_name): if not transition_times_and_offsets: # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - transition_times_and_offsets = as_column([min_date]), as_column( - [np.timedelta64(0, "s")] + transition_times_and_offsets = ( + as_column([min_date]), + as_column([np.timedelta64(0, "s")]), ) return DataFrame._from_data( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3e0ec4b5cd7..f13d8cf12f7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1731,7 +1731,8 @@ def as_column( If None (default), treats NaN values in arbitrary as null if there is no mask passed along with it. If True, combines the mask and NaNs to form a new validity mask. If False, leaves NaN values as is. - Only applies when arbitrary is not a cudf object (Index, Series, Column). + Only applies when arbitrary is not a cudf object + (Index, Series, Column). dtype : optional Optionally typecast the constructed Column to the given dtype. 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0440512c467..35588725655 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -470,9 +470,12 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): _frame: DataFrame def __getitem__(self, arg): - row_key, ( - col_is_scalar, - column_names, + ( + row_key, + ( + col_is_scalar, + column_names, + ), ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame) row_spec = indexing_utils.parse_row_iloc_indexer( row_key, len(self._frame) @@ -6901,16 +6904,18 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): if future_stack: if dropna is not no_default: raise ValueError( - "dropna must be unspecified with future_stack=True as the new " - "implementation does not introduce rows of NA values. This " - "argument will be removed in a future version of cudf." + "dropna must be unspecified with future_stack=True as " + "the new implementation does not introduce rows of NA " + "values. This argument will be removed in a future " + "version of cudf." ) else: if dropna is not no_default or self._data.nlevels > 1: warnings.warn( - "The previous implementation of stack is deprecated and will be " - "removed in a future version of cudf. Specify future_stack=True " - "to adopt the new implementation and silence this warning.", + "The previous implementation of stack is deprecated and " + "will be removed in a future version of cudf. 
Specify " + "future_stack=True to adopt the new implementation and " + "silence this warning.", FutureWarning, ) if dropna is no_default: @@ -7028,9 +7033,13 @@ def unnamed_group_generator(): unique_named_levels, axis=0, fill_value=-1 ).values else: - yield grpdf.reindex( - unique_named_levels, axis=0, fill_value=-1 - ).sort_index().values + yield ( + grpdf.reindex( + unique_named_levels, axis=0, fill_value=-1 + ) + .sort_index() + .values + ) else: if future_stack: yield column_idx_df.values diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e5030eb634b..d995964057b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -282,9 +282,12 @@ def __iter__(self): if isinstance(group_names, cudf.BaseIndex): group_names = group_names.to_pandas() for i, name in enumerate(group_names): - yield (name,) if isinstance(self._by, list) and len( - self._by - ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]] + yield ( + (name,) + if isinstance(self._by, list) and len(self._by) == 1 + else name, + grouped_values[offsets[i] : offsets[i + 1]], + ) @property def dtypes(self): @@ -2269,8 +2272,8 @@ def fillna( """ warnings.warn( "groupby fillna is deprecated and " - "will be removed in a future version. Use groupby ffill or groupby bfill " - "for forward or backward filling instead.", + "will be removed in a future version. 
Use groupby ffill " + "or groupby bfill for forward or backward filling instead.", FutureWarning, ) if inplace: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 94d862d52b4..ca9d5590044 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -211,8 +211,8 @@ def _get_label_range_or_mask(index, start, stop, step): return slice(start_loc, stop_loc) else: raise KeyError( - "Value based partial slicing on non-monotonic DatetimeIndexes " - "with non-existing keys is not allowed.", + "Value based partial slicing on non-monotonic " + "DatetimeIndexes with non-existing keys is not allowed.", ) elif start is not None: boolean_mask = index >= start @@ -2449,7 +2449,8 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None): ---------- axis : {0 or 'index', 1 or 'columns', None}, default None A specific axis to squeeze. By default, all length-1 axes are - squeezed. For `Series` this parameter is unused and defaults to `None`. + squeezed. For `Series` this parameter is unused and defaults + to `None`. 
Returns ------- @@ -5835,9 +5836,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rfloordiv( - self, other, axis, level=None, fill_value=None - ): # noqa: D102 + def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5967,9 +5966,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def eq( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) @@ -6009,9 +6006,7 @@ def eq( ), ) ) - def ne( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) @@ -6051,9 +6046,7 @@ def ne( ), ) ) - def lt( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) @@ -6093,9 +6086,7 @@ def lt( ), ) ) - def le( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__le__", fill_value=fill_value, can_reindex=True ) @@ -6135,9 +6126,7 @@ def le( ), ) ) - def gt( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) @@ -6177,9 +6166,7 @@ def gt( ), ) ) - def ge( - self, other, axis="columns", 
level=None, fill_value=None - ): # noqa: D102 + def ge(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index d182b7b4a7c..65f97c99934 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -164,9 +164,9 @@ def to_datetime( if errors == "ignore": warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. " - "Use to_datetime without passing `errors` and catch exceptions " - "explicitly instead", + "errors='ignore' is deprecated and will raise in a " + "future version. Use to_datetime without passing `errors` " + "and catch exceptions explicitly instead", FutureWarning, ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index e1424459c8f..68b23f1e059 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -97,9 +97,9 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("invalid error value specified") elif errors == "ignore": warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. " - "Use to_numeric without passing `errors` and catch exceptions " - "explicitly instead", + "errors='ignore' is deprecated and will raise in " + "a future version. Use to_numeric without passing `errors` " + "and catch exceptions explicitly instead", FutureWarning, ) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index fdce404d887..3c02ee52b25 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
import operator from functools import partial @@ -249,7 +249,7 @@ def replace_impl(context, builder, sig, args): replacement_ptr = builder.alloca(args[2].type) builder.store(args[0], src_ptr) - builder.store(args[1], to_replace_ptr), + builder.store(args[1], to_replace_ptr) builder.store(args[2], replacement_ptr) udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 12baf1ea6d1..bc1f4f2557e 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -41,9 +41,7 @@ from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB -_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( - "STRINGS_UDF_HEAP_SIZE", 2**31 -) +_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31) _heap_size = 0 _cudf_str_dtype = dtype(str) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 3f5df18eae1..e811ba1351a 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -437,9 +437,7 @@ def __get__(self, obj, owner=None) -> Any: # methods because dir for the method won't be the same as for # the pure unbound function, but the alternative is # materializing the slow object when we don't really want to. 
- result._fsproxy_slow_dir = dir( - slow_result_type - ) # type: ignore + result._fsproxy_slow_dir = dir(slow_result_type) # type: ignore return result diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index c5662d06e09..0124d411e3b 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -124,7 +124,7 @@ def get_namespaced_function_name( _MethodProxy, type[_FinalProxy], type[_IntermediateProxy], - ] + ], ): if isinstance(func_obj, _MethodProxy): # Extract classname from method object @@ -177,17 +177,15 @@ def _tracefunc(self, frame, event, arg): if self._currkey is not None and arg is not None: if arg[1]: # fast run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey][ - "gpu_time" - ] = run_time + self._results[self._currkey].get( - "gpu_time", 0 + self._results[self._currkey]["gpu_time"] = ( + run_time + + self._results[self._currkey].get("gpu_time", 0) ) else: run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey][ - "cpu_time" - ] = run_time + self._results[self._currkey].get( - "cpu_time", 0 + self._results[self._currkey]["cpu_time"] = ( + run_time + + self._results[self._currkey].get("cpu_time", 0) ) frame_locals = inspect.getargvalues(frame).locals diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e034a3f5e10..ead1ab2da6c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2351,7 +2351,7 @@ def test_dataframe_reductions(data, axis, func, skipna): for kwargs in all_kwargs: if expected_exception is not None: with pytest.raises(expected_exception): - getattr(gdf, 
func)(axis=axis, skipna=skipna, **kwargs), + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs) else: expect = getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs) with expect_warning_if( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 51e9a3022f4..05213d7601c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1721,8 +1721,7 @@ def test_get_indexer_single_unique_numeric(idx, key, method): if ( # `method` only applicable to monotonic index - not pi.is_monotonic_increasing - and method is not None + not pi.is_monotonic_increasing and method is not None ): assert_exceptions_equal( lfunc=pi.get_loc, diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 69ddd936eee..a9bca7d8b98 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -608,7 +608,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc supported_stat_types = supported_numpy_dtypes + ["str"] - # Writing bool columns to multiple row groups is disabled until #6763 is fixed + # Writing bool columns to multiple row groups is disabled + # until #6763 is fixed if nrows == 100000: supported_stat_types.remove("bool") @@ -683,7 +684,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] - # Writing bool columns to multiple row groups is disabled until #6763 is fixed + # Writing bool columns to multiple row groups is disabled + # until #6763 is fixed if nrows == 200000: supported_stat_types.remove("bool") @@ -697,8 +699,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): # Make a dataframe gdf = cudf.DataFrame( { - "col_" - + str(dtype): gen_rand_series( + "col_" + str(dtype): gen_rand_series( dtype, nrows // 2, has_nulls=True, @@ -716,8 +717,7 @@ def 
test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): # write and no pointers are saved into the original table gdf = cudf.DataFrame( { - "col_" - + str(dtype): gen_rand_series( + "col_" + str(dtype): gen_rand_series( dtype, nrows // 2, has_nulls=True, diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 18efd4417a1..8b72fe84359 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1087,8 +1087,9 @@ def struct_gen(gen, skip_rows, num_rows, include_validity=False): def R(first_val, num_fields): return { - "col" - + str(f): (gen[f](first_val, first_val) if f % 4 != 0 else None) + "col" + str(f): ( + gen[f](first_val, first_val) if f % 4 != 0 else None + ) if include_validity else (gen[f](first_val, first_val)) for f in range(len(gen)) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 925fd24e6c8..85abf438efb 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -85,9 +85,7 @@ 0 10 hello 1 20 rapids 2 30 ai -""".format( - remote_data_sources=_docstring_remote_sources -) +""".format(remote_data_sources=_docstring_remote_sources) doc_read_avro = docfmt_partial(docstring=_docstring_read_avro) _docstring_read_parquet_metadata = """ @@ -1416,9 +1414,7 @@ list of Filepath strings or in-memory buffers of data. compression : str Type of compression algorithm for the content - """.format( - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT -) + """.format(bytes_per_thread=_BYTES_PER_THREAD_DEFAULT) doc_get_reader_filepath_or_buffer = docfmt_partial(