From d1bad33caef34b8fa95543c7494780f2084ee603 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 2 Dec 2024 21:48:26 +0000 Subject: [PATCH 1/2] Update the hook versions in pre-commit (#17462) The major change here is to move to ruff 0.8 which, among other things, introduces automatic sorting for `__all__` and `__slots__` (so I've turned those on and fixed things). Notable actual bug fix: https://github.com/rapidsai/cudf/commit/b2cfb9c88db13228a94628970c4c8c01a5527d56 Authors: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Jake Awe (https://github.com/AyodeAwe) - Nghia Truong (https://github.com/ttnghia) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/17462 --- .pre-commit-config.yaml | 14 ++--- cpp/src/lists/set_operations.cu | 2 +- pyproject.toml | 25 +++++++-- python/cudf/benchmarks/common/config.py | 4 +- python/cudf/benchmarks/conftest.py | 16 +++--- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/_fuzz_testing/fuzzer.py | 2 +- python/cudf/cudf/core/_base_index.py | 8 +-- python/cudf/cudf/core/buffer/spill_manager.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/column/__init__.py | 48 ++++++++++++----- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/decimal.py | 11 ++-- python/cudf/cudf/core/column/interval.py | 3 +- python/cudf/cudf/core/column/lists.py | 3 +- python/cudf/cudf/core/column/numerical.py | 9 ++-- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 13 ++--- python/cudf/cudf/core/column/struct.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 6 ++- python/cudf/cudf/core/column_accessor.py | 6 +-- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 50 ++++++++++++------ python/cudf/cudf/core/dtypes.py | 18 +++---- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/core/index.py | 22 ++++---- python/cudf/cudf/core/indexed_frame.py | 42 +++++++-------- python/cudf/cudf/core/mixins/scans.py | 4 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/scalar.py | 2 +- python/cudf/cudf/core/series.py | 14 ++--- python/cudf/cudf/core/single_column_frame.py | 6 +-- python/cudf/cudf/core/udf/masked_typing.py | 4 +- python/cudf/cudf/datasets.py | 2 +- python/cudf/cudf/io/parquet.py | 7 +-- python/cudf/cudf/options.py | 2 +- python/cudf/cudf/pandas/__init__.py | 4 +- python/cudf/cudf/pandas/__main__.py | 2 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 6 +-- python/cudf/cudf/pandas/fast_slow_proxy.py | 6 +-- .../pandas/scripts/analyze-test-failures.py | 2 +- python/cudf/cudf/testing/dataset_generator.py | 4 +- python/cudf/cudf/testing/testing.py | 4 +- .../cudf/tests/series/test_datetimelike.py | 4 +- python/cudf/cudf/tests/test_binops.py | 12 ++--- python/cudf/cudf/tests/test_categorical.py | 6 +-- python/cudf/cudf/tests/test_concat.py | 6 +-- python/cudf/cudf/tests/test_csv.py | 4 +- .../cudf/tests/test_cuda_array_interface.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 28 +++++----- python/cudf/cudf/tests/test_feather.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 7 ++- python/cudf/cudf/tests/test_hdf.py | 2 +- python/cudf/cudf/tests/test_index.py | 14 ++--- python/cudf/cudf/tests/test_joining.py | 6 +-- python/cudf/cudf/tests/test_json.py | 14 ++--- python/cudf/cudf/tests/test_orc.py | 4 +- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_quantiles.py | 2 +- python/cudf/cudf/tests/test_replace.py | 4 +- python/cudf/cudf/tests/test_reshape.py | 6 +-- python/cudf/cudf/tests/test_scalar.py | 4 +- python/cudf/cudf/tests/test_series.py | 6 +-- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_spilling.py | 2 +- python/cudf/cudf/tests/test_string.py | 4 +- python/cudf/cudf/tests/test_testing.py | 2 +- .../cudf/cudf/tests/text/test_text_methods.py | 40 +++++++------- python/cudf/cudf/utils/ioutils.py | 2 +- python/cudf/cudf/utils/queryutils.py | 4 +- python/cudf/cudf/utils/utils.py | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 4 +- .../tests/test_matplotlib.py | 2 +- .../tests/test_plotly.py | 2 +- .../tests/test_seaborn.py | 2 +- python/cudf_polars/cudf_polars/__init__.py | 2 +- .../cudf_polars/containers/__init__.py | 2 +- python/cudf_polars/cudf_polars/dsl/expr.py | 32 ++++++------ .../dsl/expressions/aggregation.py | 2 +- .../cudf_polars/dsl/expressions/base.py | 2 +- .../cudf_polars/dsl/expressions/boolean.py | 2 +- .../cudf_polars/dsl/expressions/rolling.py | 2 +- .../cudf_polars/dsl/expressions/selection.py | 2 +- .../cudf_polars/dsl/expressions/string.py | 2 +- .../cudf_polars/dsl/expressions/unary.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 52 +++++++++---------- .../cudf_polars/cudf_polars/dsl/traversal.py | 6 +-- .../cudf_polars/typing/__init__.py | 10 ++-- .../cudf_polars/cudf_polars/utils/dtypes.py | 16 +++--- python/cudf_polars/pyproject.toml | 1 + python/dask_cudf/dask_cudf/__init__.py | 20 +++---- .../dask_cudf/dask_cudf/_expr/collection.py | 9 ++-- python/dask_cudf/dask_cudf/core.py | 4 +- python/dask_cudf/dask_cudf/io/__init__.py | 5 +- python/dask_cudf/dask_cudf/io/parquet.py | 2 +- python/dask_cudf/dask_cudf/tests/test_core.py | 4 +- .../dask_cudf/tests/test_dispatch.py | 2 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 2 +- python/libcudf/libcudf/__init__.py | 2 + python/pylibcudf/pylibcudf/__init__.py | 6 +-- python/pylibcudf/pylibcudf/nvtext/__init__.py | 2 +- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 2 +- 105 files changed, 431 insertions(+), 368 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37b26949804..39869b67547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude: | @@ -17,11 +17,11 @@ repos: ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.10.0' + rev: 'v1.13.0' hooks: - id: mypy additional_dependencies: [types-cachetools] @@ -33,7 +33,7 @@ repos: "python/dask_cudf/dask_cudf"] pass_filenames: false - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.9.1 hooks: - id: nbqa-isort # Use the cudf_kafka isort orderings in notebooks so that dask @@ -52,7 +52,7 @@ repos: ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks - rev: 0.6.6 + rev: 0.6.7 hooks: - id: fix-smartquotes exclude: | @@ -133,7 +133,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: [tomli] @@ -144,7 +144,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.8.0 hooks: - id: ruff args: ["--fix"] diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index c0bc10dd266..6f2acbb0712 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -72,7 +72,7 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, // - Generate labels for lhs and rhs child elements. // - Check existence for rows of the table {rhs_labels, rhs_child} in the table // {lhs_labels, lhs_child}. - // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence reults + // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence results // computed in the previous step. auto const lhs_child = lhs.get_sliced_child(stream); diff --git a/pyproject.toml b/pyproject.toml index 6933484f4e7..0c95ea60408 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,12 +18,13 @@ exclude = [ skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp" # ignore short words, and typename parameters like OffsetT ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" -ignore-words-list = "inout,unparseable,falsy,couldn,Couldn" +ignore-words-list = "inout,unparseable,falsy,couldn,Couldn,thirdparty" builtin = "clear" quiet-level = 3 [tool.ruff] line-length = 79 +target-version = "py310" [tool.ruff.lint] typing-modules = ["cudf._typing"] @@ -94,17 +95,35 @@ select = [ "UP035", # usage of legacy `np.random` function calls "NPY002", + # Ruff-specific rules + "RUF", ] ignore = [ # whitespace before : "E203", # line-too-long (due to Copyright header) "E501", + # type-comparison, disabled because we compare types to numpy dtypes + "E721", + # String contains ambiguous character + "RUF001", + # Parenthesize `a and b` expressions when chaining `and` and `or` + # together, to make the precedence clear + "RUF021", + # Mutable class attributes should be annotated with + # `typing.ClassVar` + "RUF012", ] fixable = ["ALL"] exclude = [ - # TODO: Remove this in a follow-up where we fix __all__. - "__init__.py", + # TODO: https://github.com/rapidsai/cudf/issues/17461 + "**/*.ipynb", +] + +[tool.ruff.format] +exclude = [ + # TODO: https://github.com/rapidsai/cudf/issues/17461 + "**/*.ipynb", ] [tool.ruff.lint.per-file-ignores] diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py index c1e9d4d6116..872ba424d20 100644 --- a/python/cudf/benchmarks/common/config.py +++ b/python/cudf/benchmarks/common/config.py @@ -42,9 +42,9 @@ def pytest_collection_modifyitems(session, config, items): items[:] = list(filter(is_pandas_compatible, items)) else: - import cupy # noqa: W0611, F401 + import cupy # noqa: F401 - import cudf # noqa: W0611, F401 + import cudf # noqa: F401 def pytest_collection_modifyitems(session, config, items): pass diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 0e4afadccf5..24ff211387c 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,18 +56,16 @@ # into the main repo. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -# Turn off isort until we upgrade to 5.8.0 -# https://github.com/pycqa/isort/issues/1594 -from config import ( # noqa: W0611, E402, F401 +from config import ( NUM_COLS, NUM_ROWS, - collect_ignore, - cudf, # noqa: W0611, E402, F401 - pytest_collection_modifyitems, - pytest_sessionfinish, - pytest_sessionstart, + collect_ignore, # noqa: F401 + cudf, + pytest_collection_modifyitems, # noqa: F401 + pytest_sessionfinish, # noqa: F401 + pytest_sessionstart, # noqa: F401 ) -from utils import ( # noqa: E402 +from utils import ( OrderedSet, collapse_fixtures, column_generators, diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 99b759e2166..843f2670b4d 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -99,6 +99,7 @@ __all__ = [ + "NA", "BaseIndex", "CategoricalDtype", "CategoricalIndex", @@ -114,7 +115,6 @@ "IntervalIndex", "ListDtype", "MultiIndex", - "NA", "NaT", "RangeIndex", "Scalar", diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index ee1b2c1f1c4..4b080937a17 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -95,7 +95,7 @@ def start(self): else: self._data_handler.set_rand_params(self.params) kwargs = self._data_handler._current_params["test_kwargs"] - logging.info(f"Parameters passed: {str(kwargs)}") + logging.info(f"Parameters passed: {kwargs!s}") self._target(file_name, **kwargs) except KeyboardInterrupt: logging.info( diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a6abd63d042..2df154ee112 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -133,7 +133,7 @@ def memory_usage(self, deep=False): """ raise NotImplementedError - def tolist(self): # noqa: D102 + def tolist(self): raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " @@ -148,7 +148,7 @@ def name(self): raise NotImplementedError @property # type: ignore - def ndim(self) -> int: # noqa: D401 + def ndim(self) -> int: """Number of dimensions of the underlying data, by definition 1.""" return 1 @@ -265,7 +265,7 @@ def get_loc(self, key): slice(1, 3, None) >>> multi_index.get_loc(('b', 'e')) 1 - """ # noqa: E501 + """ def max(self): """The maximum value of the index.""" @@ -1473,7 +1473,7 @@ def _intersection(self, other, sort=None): ._data ) - if sort is {None, True} and len(other): + if sort in {None, True} and len(other): return intersection_result.sort_values() return intersection_result diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index ed351a6b107..07d0d698cb8 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -54,7 +54,7 @@ def get_rmm_memory_resource_stack( """ if hasattr(mr, "upstream_mr"): - return [mr] + get_rmm_memory_resource_stack(mr.upstream_mr) + return [mr, *get_rmm_memory_resource_stack(mr.upstream_mr)] return [mr] @@ -275,7 +275,7 @@ def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool: print( f"[WARNING] RMM allocation of {format_bytes(nbytes)} bytes " "failed, spill-on-demand couldn't find any device memory to " - f"spill:\n{repr(self)}\ntraceback:\n{get_traceback()}\n" + f"spill:\n{self!r}\ntraceback:\n{get_traceback()}\n" f"{self.statistics}" ) return False # Since we didn't find anything to spill, we give up diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b40c56c9a6b..7305ff651c6 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -366,7 +366,7 @@ def __str__(self) -> str: f"<{self.__class__.__name__} size={format_bytes(self._size)} " f"spillable={self.spillable} exposed={self.exposed} " f"num-spill-locks={len(self._spill_locks)} " - f"ptr={ptr_info} owner={repr(self._owner)}>" + f"ptr={ptr_info} owner={self._owner!r}>" ) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index a1e87d04bc9..0a9d339a6a8 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,9 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -""" -isort: skip_file -""" - from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, @@ -15,17 +11,43 @@ deserialize_columns, serialize_columns, ) -from cudf.core.column.datetime import DatetimeColumn # noqa: F401 -from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401 -from cudf.core.column.lists import ListColumn # noqa: F401 -from cudf.core.column.numerical import NumericalColumn # noqa: F401 -from cudf.core.column.string import StringColumn # noqa: F401 -from cudf.core.column.struct import StructColumn # noqa: F401 -from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import ( # noqa: F401 +from cudf.core.column.datetime import ( + DatetimeColumn, + DatetimeTZColumn, +) +from cudf.core.column.decimal import ( Decimal32Column, Decimal64Column, Decimal128Column, DecimalBaseColumn, ) +from cudf.core.column.interval import IntervalColumn +from cudf.core.column.lists import ListColumn +from cudf.core.column.numerical import NumericalColumn +from cudf.core.column.string import StringColumn +from cudf.core.column.struct import StructColumn +from cudf.core.column.timedelta import TimeDeltaColumn + +__all__ = [ + "CategoricalColumn", + "ColumnBase", + "DatetimeColumn", + "DatetimeTZColumn", + "Decimal32Column", + "Decimal64Column", + "Decimal128Column", + "DecimalBaseColumn", + "IntervalColumn", + "ListColumn", + "NumericalColumn", + "StringColumn", + "StructColumn", + "TimeDeltaColumn", + "as_column", + "build_column", + "column_empty", + "column_empty_like", + "concat_columns", + "deserialize_columns", + "serialize_columns", +] diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7551703c53e..cbbe01f7289 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -888,7 +888,7 @@ def find_and_replace( if len(replacement_col) == replacement_col.null_count: replacement_col = replacement_col.astype(self.categories.dtype) - if type(to_replace_col) != type(replacement_col): + if type(to_replace_col) is not type(replacement_col): raise TypeError( f"to_replace and value should be of same types," f"got to_replace dtype: {to_replace_col.dtype} and " diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 24b55fe1bc2..c9be3f239f9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,6 +18,8 @@ import pylibcudf as plc import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import unary @@ -28,7 +30,7 @@ get_tz_data, ) from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, column, string +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import ( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index ce7aa91f775..ac9a2caad50 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -18,7 +18,8 @@ from cudf.api.types import is_scalar from cudf.core._internals import unary from cudf.core.buffer import as_buffer -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase +from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( Decimal32Dtype, Decimal64Dtype, @@ -28,8 +29,6 @@ from cudf.core.mixins import BinaryOperand from cudf.utils.utils import pa_mask_buffer_to_mask -from .numerical_base import NumericalBaseColumn - if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -435,7 +434,7 @@ def _get_decimal_type( `op` for the given dtypes. For precision & scale calculations see : https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 + """ # This should at some point be hooked up to libcudf's # binary_operation_fixed_point_scale @@ -506,8 +505,8 @@ def _get_decimal_type( # if we've reached this point, we cannot create a decimal type without # overflow; raise an informative error raise ValueError( - f"Performing {op} between columns of type {repr(lhs_dtype)} and " - f"{repr(rhs_dtype)} would result in overflow" + f"Performing {op} between columns of type {lhs_dtype!r} and " + f"{rhs_dtype!r} would result in overflow" ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 9147270c289..34975fc94f4 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -7,7 +7,8 @@ import pyarrow as pa import cudf -from cudf.core.column import StructColumn, as_column +from cudf.core.column.column import as_column +from cudf.core.column.struct import StructColumn from cudf.core.dtypes import IntervalDtype if TYPE_CHECKING: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 42df5123014..789c4a7f3cb 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -13,11 +13,12 @@ import pylibcudf as plc import cudf +import cudf.core.column.column as column from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c8f859596b2..8ca42debb72 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -12,10 +12,13 @@ import pylibcudf import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import unary -from cudf.core.column import ColumnBase, as_column, column, string +from cudf.core.column.column import ColumnBase, as_column +from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError @@ -26,8 +29,6 @@ np_dtypes_to_pandas_dtypes, ) -from .numerical_base import NumericalBaseColumn - if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -226,7 +227,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # If `other` is a Python integer and it is out-of-bounds # promotion could fail but we can trivially define the result # in terms of `notnull` or `NULL_NOT_EQUALS`. - if type(other) is int and self.dtype.kind in "iu": # noqa: E721 + if type(other) is int and self.dtype.kind in "iu": truthiness = None iinfo = np.iinfo(self.dtype) if iinfo.min > other: diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 6d639337401..ea242e34edb 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fa5f0dd99fa..76d67585609 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -17,13 +17,14 @@ import cudf import cudf.api.types +import cudf.core.column.column as column +import cudf.core.column.datetime as datetime from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring @@ -548,7 +549,7 @@ def join( 2 3 c-d dtype: object - """ # noqa E501 + """ if sep is None: sep = "" @@ -694,7 +695,7 @@ def extract( The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. - """ # noqa W605 + """ if not _is_supported_regex_flags(flags): raise NotImplementedError( "unsupported value for `flags` parameter" @@ -830,7 +831,7 @@ def contains( value is set. The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. - """ # noqa W605 + """ if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") if regex and isinstance(pat, re.Pattern): @@ -3675,7 +3676,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - Some characters need to be escaped when passing in pat. e.g. ``'$'`` has a special meaning in regex and must be escaped when finding this literal character. - """ # noqa W605 + """ if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U pat = pat.pattern @@ -6160,7 +6161,7 @@ def find_and_replace( to_replace_col = column.as_column(to_replace) replacement_col = column.as_column(replacement) - if type(to_replace_col) != type(replacement_col): + if type(to_replace_col) is not type(replacement_col): raise TypeError( f"to_replace and value should be of same types," f"got to_replace dtype: {to_replace_col.dtype} and " diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2adc6b54bab..db6ad72ab56 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 620fe31c30f..ccc9ef2b3f6 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -11,11 +11,13 @@ import pyarrow as pa import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_scalar from cudf.core._internals import unary from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, column, string +from cudf.core.column.column import ColumnBase from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import ( _all_bools_with_nulls, @@ -468,7 +470,7 @@ def components(self) -> dict[str, ColumnBase]: 2 13000 10 12 48 712 0 0 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 - """ # noqa: E501 + """ date_meta = { "seconds": ["m", "s"], diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 496e86ed709..e4fd82e819b 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -49,7 +49,7 @@ def from_zip(cls, data: abc.Iterator): def __getitem__(self, key): """Recursively apply dict.__getitem__ for nested elements.""" # As described in the pandas docs - # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # noqa: E501 + # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # accessing nested elements of a multiindex must be done using a tuple. # Lists and other sequences are treated as accessing multiple elements # at the top level of the index. @@ -62,10 +62,10 @@ def _to_flat_dict_inner(d: dict, parents: tuple = ()): for k, v in d.items(): if not isinstance(v, d.__class__): if parents: - k = parents + (k,) + k = (*parents, k) yield (k, v) else: - yield from _to_flat_dict_inner(d=v, parents=parents + (k,)) + yield from _to_flat_dict_inner(d=v, parents=(*parents, k)) class ColumnAccessor(abc.MutableMapping): diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index a4d12cfc7f0..5bfea45a946 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -149,7 +149,7 @@ def cut( if len(set(bins)) is not len(bins): if duplicates == "raise": raise ValueError( - f"Bin edges must be unique: {repr(bins)}.\n" + f"Bin edges must be unique: {bins!r}.\n" f"You can drop duplicate edges by setting the 'duplicates'" "kwarg" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b58ab13be93..fa8d517a9ef 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,7 +13,13 @@ import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator, MutableMapping +from collections.abc import ( + Callable, + Hashable, + Iterator, + MutableMapping, + Sequence, +) from typing import TYPE_CHECKING, Any, Literal, cast import cupy @@ -1131,7 +1137,7 @@ def _from_data( data: MutableMapping, index: BaseIndex | None = None, columns: Any = None, - ) -> DataFrame: + ) -> Self: out = super()._from_data(data=data, index=index) if columns is not None: out.columns = columns @@ -2242,7 +2248,7 @@ def from_dict( n1 n2 a b 1 3 c 2 4 - """ # noqa: E501 + """ orient = orient.lower() if orient == "index": @@ -2399,7 +2405,7 @@ def to_dict( >>> df.to_dict('records', into=dd) [defaultdict(, {'col1': 1, 'col2': 0.5}), defaultdict(, {'col1': 2, 'col2': 0.75})] - """ # noqa: E501 + """ orient = orient.lower() if orient == "series": @@ -3027,7 +3033,7 @@ def set_index( if len(keys) == 0: raise ValueError("No valid columns to be added to index.") if append: - keys = [self.index] + keys + keys = [self.index, *keys] # Preliminary type check labels_not_found = [] @@ -3093,7 +3099,7 @@ def set_index( @_performance_tracking def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None - ): # noqa: D102 + ): if isinstance(value, (pd.Series, pd.DataFrame)): value = cudf.from_pandas(value) if isinstance(value, cudf.Series): @@ -3574,7 +3580,7 @@ def drop_duplicates( 1 Yum Yum cup 4.0 2 Indomie cup 3.5 4 Indomie pack 5.0 - """ # noqa: E501 + """ outdf = super().drop_duplicates( subset=subset, keep=keep, @@ -4854,7 +4860,7 @@ def map( if na_action not in {"ignore", None}: raise ValueError( - f"na_action must be 'ignore' or None. Got {repr(na_action)}" + f"na_action must be 'ignore' or None. Got {na_action!r}" ) if na_action == "ignore": @@ -5727,7 +5733,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: """ data = self - index_descr = [] + index_descr: Sequence[dict[str, Any]] | Sequence[str] = [] write_index = preserve_index is not False keep_range_index = write_index and preserve_index is None index = self.index @@ -5934,7 +5940,7 @@ def _from_arrays( index=None, columns=None, nan_as_null=False, - ): + ) -> Self: """ Convert an object implementing an array interface to DataFrame. @@ -5987,6 +5993,12 @@ def _from_arrays( raise ValueError("Duplicate column names are not allowed") names = columns + # Mapping/MutableMapping are invariant in the key type, so + # dict[int, ColumnBase] (the inferred type of ca_data) is not + # a valid type to pass to a function accepting + # Mapping[Hashable, ColumnBase] even though int is Hashable. + # See: https://github.com/python/typing/issues/445 + ca_data: dict[Hashable, ColumnBase] if array_data.ndim == 2: ca_data = { k: column.as_column(array_data[:, i], nan_as_null=nan_as_null) @@ -6133,7 +6145,7 @@ def quantile( non-numeric types and result is expected to be a Series in case of Pandas. cuDF will return a DataFrame as it doesn't support mixed types under Series. - """ # noqa: E501 + """ if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -6832,7 +6844,7 @@ def select_dtypes(self, include=None, exclude=None): 3 False 2.0 4 True 1.0 5 False 2.0 - """ # noqa: E501 + """ # code modified from: # https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L3196 @@ -7035,7 +7047,9 @@ def to_orc( ) @_performance_tracking - def stack(self, level=-1, dropna=no_default, future_stack=False): + def stack( + self, level=-1, dropna=no_default, future_stack=False + ) -> DataFrame | Series: """Stack the prescribed level(s) from columns to index Return a reshaped DataFrame or Series having a multi-level @@ -7282,11 +7296,13 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): ) if has_unnamed_levels: - unnamed_level_values = list( - map(column_name_idx.get_level_values, unnamed_levels_indices) - ) unnamed_level_values = pd.MultiIndex.from_arrays( - unnamed_level_values + list( + map( + column_name_idx.get_level_values, + unnamed_levels_indices, + ) + ) ) def unnamed_group_generator(): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2110e610c37..801020664da 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -237,7 +237,7 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> cudf_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) >>> cudf_dtype CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 + """ return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) @@ -254,7 +254,7 @@ def to_pandas(self) -> pd.CategoricalDtype: CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> dtype.to_pandas() CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 + """ if self._categories is None: categories = None elif self._categories.dtype.kind == "f": @@ -399,7 +399,7 @@ def element_type(self) -> Dtype: ListDtype(float32) >>> deep_nested_type.element_type.element_type.element_type 'float32' - """ # noqa: E501 + """ if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) elif isinstance(self._typ.value_type, pa.StructType): @@ -420,7 +420,7 @@ def leaf_type(self): ListDtype(ListDtype(ListDtype(float32))) >>> deep_nested_type.leaf_type 'float32' - """ # noqa: E501 + """ if isinstance(self.element_type, ListDtype): return self.element_type.leaf_type else: @@ -486,7 +486,7 @@ def __eq__(self, other): def __repr__(self): if isinstance(self.element_type, (ListDtype, StructDtype)): - return f"{type(self).__name__}({repr(self.element_type)})" + return f"{type(self).__name__}({self.element_type!r})" else: return f"{type(self).__name__}({self.element_type})" @@ -556,7 +556,7 @@ class StructDtype(_BaseDtype): >>> nested_struct_dtype = cudf.StructDtype({"dict_data": struct_dtype, "c": "uint8"}) >>> nested_struct_dtype StructDtype({'dict_data': StructDtype({'a': dtype('int64'), 'b': dtype('O')}), 'c': dtype('uint8')}) - """ # noqa: E501 + """ name = "struct" @@ -730,7 +730,7 @@ def itemsize(self): >>> decimal{size}_dtype = cudf.Decimal{size}Dtype(precision=9, scale=2) >>> decimal{size}_dtype Decimal{size}Dtype(precision=9, scale=2) - """ # noqa: E501 + """ ) @@ -743,7 +743,7 @@ def __init__(self, precision, scale=0): @property def str(self): - return f"{str(self.name)}({self.precision}, {self.scale})" + return f"{self.name!s}({self.precision}, {self.scale})" @property def precision(self): @@ -950,7 +950,7 @@ def __eq__(self, other): # This means equality isn't transitive but mimics pandas return other in (self.name, str(self)) return ( - type(self) == type(other) + type(self) is type(other) and self.subtype == other.subtype and self.closed == other.closed ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0c0f271fe6f..70789160cb6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1392,7 +1392,7 @@ def argsort( >>> idx = cudf.Index([3, 1, 2]) >>> idx.argsort() array([1, 2, 0], dtype=int32) - """ # noqa: E501 + """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") if kind != "quicksort": diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e977f037b79..29ab3b60d9d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1467,9 +1467,7 @@ def _iterative_groupby_apply( RuntimeWarning, ) - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] + chunks = [grouped_values[s:e] for s, e in itertools.pairwise(offsets)] chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ff9cd310aef..eac04cf36ec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1619,7 +1619,7 @@ def argsort( Returns ------- cupy.ndarray: The indices sorted based on input. - """ # noqa: E501 + """ return super().argsort( axis=axis, kind=kind, @@ -2218,7 +2218,7 @@ def year(self) -> Index: DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') >>> datetime_index.year Index([2000, 2001, 2002], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.year, name=self.name) @property # type: ignore @@ -2237,7 +2237,7 @@ def month(self) -> Index: DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') >>> datetime_index.month Index([1, 2, 3], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.month, name=self.name) @property # type: ignore @@ -2256,7 +2256,7 @@ def day(self) -> Index: DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') >>> datetime_index.day Index([1, 2, 3], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.day, name=self.name) @property # type: ignore @@ -2340,7 +2340,7 @@ def microsecond(self) -> Index: dtype='datetime64[ns]') >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') - """ # noqa: E501 + """ return Index._from_column( ( # Need to manually promote column to int32 because @@ -2615,7 +2615,7 @@ def ceil(self, freq: str) -> Self: ... ]) >>> gIndex.ceil("T") DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking @@ -2646,7 +2646,7 @@ def floor(self, freq: str) -> Self: ... ]) >>> gIndex.floor("T") DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column( self._column.floor(freq), name=self.name ) @@ -2686,7 +2686,7 @@ def round(self, freq: str) -> Self: DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column( self._column.round(freq), name=self.name ) @@ -2737,7 +2737,7 @@ def tz_localize( ``ambiguous`` and ``nonexistent`` arguments. Any ambiguous or nonexistent timestamps are converted to 'NaT'. - """ # noqa: E501 + """ result_col = self._column.tz_localize(tz, ambiguous, nonexistent) return DatetimeIndex._from_column( result_col, name=self.name, freq=self._freq @@ -2774,7 +2774,7 @@ def tz_convert(self, tz: str | None) -> Self: '2018-03-02 14:00:00+00:00', '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') - """ # noqa: E501 + """ result_col = self._column.tz_convert(tz) return DatetimeIndex._from_column(result_col, name=self.name) @@ -3118,7 +3118,7 @@ class CategoricalIndex(Index): >>> cudf.CategoricalIndex( ... data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') - """ # noqa: E501 + """ @_performance_tracking def __init__( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f8c2587937..21ac009e7ff 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -607,7 +607,7 @@ def copy(self, deep: bool = True) -> Self: ) @_performance_tracking - def equals(self, other) -> bool: # noqa: D102 + def equals(self, other) -> bool: return super().equals(other) and self.index.equals(other.index) @property @@ -5474,7 +5474,7 @@ def groupby( ), ) ) - def add(self, other, axis, level=None, fill_value=None): # noqa: D102 + def add(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5515,7 +5515,7 @@ def add(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 + def radd(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5556,7 +5556,7 @@ def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 + def subtract(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5599,7 +5599,7 @@ def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rsub(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5640,7 +5640,7 @@ def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 + def multiply(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5683,7 +5683,7 @@ def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rmul(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5724,7 +5724,7 @@ def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 + def mod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5765,7 +5765,7 @@ def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rmod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5806,7 +5806,7 @@ def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 + def pow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5847,7 +5847,7 @@ def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rpow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5888,7 +5888,7 @@ def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def floordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5929,7 +5929,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rfloordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5970,7 +5970,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def truediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -6015,7 +6015,7 @@ def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rtruediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -6059,7 +6059,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def eq(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) @@ -6099,7 +6099,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def ne(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) @@ -6139,7 +6139,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def lt(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) @@ -6179,7 +6179,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def le(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__le__", fill_value=fill_value, can_reindex=True ) @@ -6219,7 +6219,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def gt(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) @@ -6259,7 +6259,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def ge(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def ge(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py index b0f606e32e6..289fcb84d91 100644 --- a/python/cudf/cudf/core/mixins/scans.py +++ b/python/cudf/cudf/core/mixins/scans.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from .mixin_factory import _create_delegating_mixin @@ -12,5 +12,5 @@ "cumprod", "cummin", "cummax", - }, # noqa: E231 + }, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 19a53af018d..173d4e1c584 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -567,7 +567,7 @@ def levels(self) -> list[cudf.Index]: names=['a', 'b']) >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] - """ # noqa: E501 + """ return [ idx.rename(name) for idx, name in zip(self._levels, self.names) ] diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 016bd1225cd..f37b44b1100 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1013,7 +1013,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._column_names, + level_names=(None, *columns._column_names), verify=False, ) return cudf.DataFrame._from_data(ca, index=index_labels) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f6331aa1f49..80dd0921f9c 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -304,7 +304,7 @@ def __repr__(self): # https://github.com/numpy/numpy/issues/17552 return ( f"{self.__class__.__name__}" - f"({str(self.value)}, dtype={self.dtype})" + f"({self.value!s}, dtype={self.dtype})" ) def _binop_result_dtype_or_error(self, other, op): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 95ea22b5ad5..928f3c3d666 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None): 3 a dtype: category Categories (3, object): ['a', 'b', 'c'] - """ # noqa: E501 + """ col = as_column(categorical) if codes is not None: codes = as_column(codes) @@ -942,7 +942,7 @@ def drop( labels, axis, index, columns, level, inplace, errors ) - def tolist(self): # noqa: D102 + def tolist(self): raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " @@ -1087,7 +1087,7 @@ def reindex( DataFrame, followed by the original Series values. When `drop` is True, a `Series` is returned. In either case, if ``inplace=True``, no value is returned. -""", # noqa: E501 +""", example=""" >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) >>> series @@ -1196,7 +1196,7 @@ def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame: 12 c 13 15 d - """ # noqa: E501 + """ return self._to_frame(name=name, index=self.index) @_performance_tracking @@ -2122,7 +2122,7 @@ def data(self): >>> np.array(series.data.memoryview()) array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) - """ # noqa: E501 + """ return self._column.data @property # type: ignore @@ -4590,7 +4590,7 @@ def is_month_end(self) -> Series: 7 False 8 False dtype: bool - """ # noqa: E501 + """ return self._return_result_like_self(self.series._column.is_month_end) @property # type: ignore @@ -5169,7 +5169,7 @@ def components(self) -> cudf.DataFrame: 2 13000 10 12 48 712 0 0 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 - """ # noqa: E501 + """ ca = ColumnAccessor(self.series._column.components(), verify=False) return self.series._constructor_expanddim._from_data( ca, index=self.series.index diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 0e66f383ca0..f6d0664758f 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -83,7 +83,7 @@ def name(self, value): @property # type: ignore @_performance_tracking - def ndim(self) -> int: # noqa: D401 + def ndim(self) -> int: """Number of dimensions of the underlying data, by definition 1.""" return 1 @@ -105,12 +105,12 @@ def _column(self) -> ColumnBase: @property # type: ignore @_performance_tracking - def values(self) -> cupy.ndarray: # noqa: D102 + def values(self) -> cupy.ndarray: return self._column.values @property # type: ignore @_performance_tracking - def values_host(self) -> numpy.ndarray: # noqa: D102 + def values_host(self) -> numpy.ndarray: return self._column.values_host @classmethod diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index 4c90c5bbba0..3a1e01caf28 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import operator @@ -50,7 +50,7 @@ SUPPORTED_NUMPY_TYPES = ( NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES ) -supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) +supported_type_str = "\n".join(sorted([*list(SUPPORTED_NUMPY_TYPES), "bool"])) _units = ["ns", "ms", "us", "s"] _datetime_cases = {types.NPDatetime(u) for u in _units} diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index dbabaacf6b5..e8d634598f4 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -6,7 +6,7 @@ import cudf from cudf._lib.transform import bools_to_mask -__all__ = ["timeseries", "randomdata"] +__all__ = ["randomdata", "timeseries"] # TODO: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 750c6cec180..2382e9f12ed 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1062,10 +1062,7 @@ def to_parquet( ) partition_info = ( - [ - (i, j - i) - for i, j in zip(partition_offsets, partition_offsets[1:]) - ] + [(i, j - i) for i, j in itertools.pairwise(partition_offsets)] if partition_offsets is not None else None ) @@ -1485,7 +1482,7 @@ def write_table(self, df): ) existing_cw_batch = defaultdict(dict) new_cw_paths = [] - partition_info = [(i, j - i) for i, j in zip(offsets, offsets[1:])] + partition_info = [(i, j - i) for i, j in itertools.pairwise(offsets)] for path, part_info, meta_path in zip( paths, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index e206c8bca08..79a3a794af3 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -380,7 +380,7 @@ class option_context(ContextDecorator): >>> from cudf import option_context >>> with option_context('mode.pandas_compatible', True, 'default_float_bitwidth', 32): ... pass - """ # noqa: E501 + """ def __init__(self, *args) -> None: if len(args) % 2 != 0: diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index bacf1f7e77b..fec181e85d7 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -12,7 +12,7 @@ from .magics import load_ipython_extension from .profiler import Profiler -__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"] +__all__ = ["Profiler", "install", "is_proxy_object", "load_ipython_extension"] LOADED = False @@ -57,7 +57,7 @@ def install(): current_mr = rmm.mr.get_current_device_resource() if not isinstance(current_mr, rmm.mr.CudaMemoryResource): warnings.warn( - f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={rmm_mode!s}", UserWarning, ) return diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index e0d3d9101a9..619ee822a54 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -96,7 +96,7 @@ def main(): (module,) = args.module # run the module passing the remaining arguments # as if it were run with python -m - sys.argv[:] = [module] + args.args # not thread safe? + sys.argv[:] = [module, *args.args] # not thread safe? runpy.run_module(module, run_name="__main__") elif len(args.args) >= 1: # Remove ourself from argv and continue diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 05e7d159c63..e763875adb8 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -77,8 +77,8 @@ def _pandas_util_dir(): # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py res = list( set( - list(importlib.import_module("pandas.util").__dict__.keys()) - + [ + [ + *list(importlib.import_module("pandas.util").__dict__.keys()), "Appender", "Substitution", "_exceptions", @@ -219,7 +219,7 @@ def Timestamp_Timedelta__new__(cls, *args, **kwargs): def _DataFrame__dir__(self): # Column names that are string identifiers are added to the dir of the # DataFrame - # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 # noqa: E501 + # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 _pd_df_dir = dir(pd.DataFrame) return _pd_df_dir + [ colname diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 40893ee2614..d32d388b975 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -247,7 +247,7 @@ def _fsproxy_state(self) -> _State: if metaclasses: metaclass = types.new_class( # type: ignore f"{name}_Meta", - metaclasses + (_FastSlowProxyMeta,), + (*metaclasses, _FastSlowProxyMeta), {}, ) cls = types.new_class( @@ -1301,7 +1301,7 @@ def _replace_closurevars( return functools.update_wrapper( g, f, - assigned=functools.WRAPPER_ASSIGNMENTS + ("__kwdefaults__",), + assigned=(*functools.WRAPPER_ASSIGNMENTS, "__kwdefaults__"), ) diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index bb2fc00d9fc..e4ee0ce1ca4 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -41,7 +41,7 @@ def count_failures(log_file_name, pattern): PANDAS_TEST_PREFIX ) if fnmatch(line_module_name, pattern): - if "longrepr" in line and line["longrepr"]: + if line.get("longrepr"): if isinstance(line["longrepr"], (tuple, list)): message = line["longrepr"][2].splitlines()[0] elif isinstance(line["longrepr"], str): diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 99b686406fb..01a75a2efb0 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -237,9 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + rng = np.random.default_rng(seed=parameters.seed) else: - rng = np.random.default_rng(seed=0) # noqa: F841 + rng = np.random.default_rng(seed=0) # For each column, invoke the data generator for column_params in parameters.column_parameters: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 8d342f8e6c6..0b09cf7dc34 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -77,7 +77,7 @@ def _check_types( ): return - if type(left) != type(right): + if type(left) is not type(right): raise_assert_detail( obj, "Class types are different", f"{type(left)}", f"{type(right)}" ) @@ -149,7 +149,7 @@ def assert_column_equal( ): pass else: - if type(left) != type(right) or left.dtype != right.dtype: + if type(left) is not type(right) or left.dtype != right.dtype: msg1 = f"{left.dtype}" msg2 = f"{right.dtype}" raise_assert_detail(obj, "Dtypes are different", msg1, msg2) diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 691da224f44..81ba61b31dc 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -57,7 +57,7 @@ def test_localize_ambiguous(request, unit, zone_name): request.applymarker( pytest.mark.xfail( condition=(zone_name == "America/Metlakatla"), - reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", # noqa: E501 + reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", ) ) s = cudf.Series( @@ -83,7 +83,7 @@ def test_localize_nonexistent(request, unit, zone_name): request.applymarker( pytest.mark.xfail( condition=(zone_name == "America/Grand_Turk"), - reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", # noqa: E501 + reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", ) ) s = cudf.Series( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 71b6bbd688d..0712a0de635 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -167,11 +167,11 @@ _operators_arithmetic = _operators_arithmetic[:1] _operators_comparison = _operators_comparison[:1] _cudf_scalar_reflected_ops = _cudf_scalar_reflected_ops[:1] - DATETIME_TYPES = {"datetime64[ms]"} # noqa: F811 - NUMERIC_TYPES = {"float32"} # noqa: F811 - FLOAT_TYPES = {"float64"} # noqa: F811 - INTEGER_TYPES = {"int16"} # noqa: F811 - TIMEDELTA_TYPES = {"timedelta64[s]"} # noqa: F811 + DATETIME_TYPES = {"datetime64[ms]"} + NUMERIC_TYPES = {"float32"} + FLOAT_TYPES = {"float64"} + INTEGER_TYPES = {"int16"} + TIMEDELTA_TYPES = {"timedelta64[s]"} # To save time, we skip tests marked "pytest.mark.xfail" pytest_xfail = pytest.mark.skipif @@ -444,7 +444,7 @@ def test_str_series_compare_num_reflected( @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @pytest.mark.parametrize("nelem", [1, 2, 100]) @pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES + ["datetime64[ms]"]) +@pytest.mark.parametrize("dtype", [*utils.NUMERIC_TYPES, "datetime64[ms]"]) @pytest.mark.parametrize("use_cudf_scalar", [True, False]) def test_series_compare_scalar( nelem, cmpop, obj_class, dtype, use_cudf_scalar diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index db41f689255..db24fdd2a29 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -98,7 +98,7 @@ def test_categorical_compare_unordered(): # test equal out = sr == sr assert out.dtype == np.bool_ - assert type(out[0]) == np.bool_ + assert type(out[0]) is np.bool_ assert np.all(out.to_numpy()) assert np.all(pdsr == pdsr) @@ -134,7 +134,7 @@ def test_categorical_compare_ordered(): # test equal out = sr1 == sr1 assert out.dtype == np.bool_ - assert type(out[0]) == np.bool_ + assert type(out[0]) is np.bool_ assert np.all(out.to_numpy()) assert np.all(pdsr1 == pdsr1) @@ -768,7 +768,7 @@ def test_categorical_setitem_with_nan(): assert_eq(gs, expected_series) -@pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) +@pytest.mark.parametrize("dtype", [*list(NUMERIC_TYPES), "object"]) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): dtype = cudf.dtype(dtype) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index ab0f1767cd6..f57f256d55c 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -625,7 +625,7 @@ def test_concat_series_dataframe_input_str(objs): ) @pytest.mark.parametrize("ignore_index", [True, False]) def test_concat_empty_dataframes(df, other, ignore_index): - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1224,7 +1224,7 @@ def test_concat_join_empty_dataframes( request, df, other, ignore_index, join, sort ): axis = 0 - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1312,7 +1312,7 @@ def test_concat_join_empty_dataframes_axis_1( df, other, ignore_index, axis, join, sort ): # no duplicate columns - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index ac772c47e3a..e18112d03ea 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -916,10 +916,10 @@ def test_csv_reader_nrows(tmpdir): str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows ) assert df.shape == (read_rows, 2) - assert str(skip_rows) in list(df)[0] + assert str(skip_rows) in next(iter(df)) assert str(2 * skip_rows) in list(df)[1] for row in range(0, read_rows // sample_skip, sample_skip): - assert df[list(df)[0]][row] == row + skip_rows + 1 + assert df[next(iter(df))][row] == row + skip_rows + 1 assert df[list(df)[1]][row] == 2 * (row + skip_rows + 1) assert df[list(df)[1]][read_rows - 1] == 2 * (read_rows + skip_rows) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 29f2f46e3c7..381ca45de31 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -170,7 +170,7 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): # CuPy array a = cudf.Series(cupy.asarray([1, 2, 3]))._column a = cudf.core.column.as_column(a) - b = cupy.asarray([1, 1, 1]) # noqa: F841 + b = cupy.asarray([1, 1, 1]) assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) a = cudf.Series(cupy.asarray([1, 2, 3]))._column diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 509ee0d65a5..d04fd97dcbd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -56,9 +56,9 @@ # If spilling is enabled globally, we skip many test permutations # to reduce running time. if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 + ALL_TYPES = ["float32"] + DATETIME_TYPES = ["datetime64[ms]"] + NUMERIC_TYPES = ["float32"] # To save time, we skip tests marked "xfail" pytest_xfail = pytest.mark.skipif @@ -452,8 +452,8 @@ def test_dataframe_basic(): df = cudf.concat([df, df2]) assert len(df) == 11 - hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) - hvals = np.asarray(rnd_vals.tolist() + [321]) + hkeys = np.asarray([*np.arange(10, dtype=np.float64).tolist(), 123]) + hvals = np.asarray([*rnd_vals.tolist(), 321]) np.testing.assert_equal(df["keys"].to_numpy(), hkeys) np.testing.assert_equal(df["vals"].to_numpy(), hvals) @@ -1118,7 +1118,7 @@ def test_dataframe_to_string_wide(monkeypatch): 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2 2 - [3 rows x 100 columns]""" # noqa: E501 + [3 rows x 100 columns]""" ) assert got == expect @@ -2197,7 +2197,7 @@ def test_dataframe_shape_empty(): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 20]) -@pytest.mark.parametrize("dtype", dtypes + ["object"]) +@pytest.mark.parametrize("dtype", [*dtypes, "object"]) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # In case of `bool` dtype: pandas <= 1.2.5 type-casts @@ -2842,7 +2842,7 @@ def test_arrow_round_trip(preserve_index, index): assert_eq(gdf_out, pdf_out) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) def test_cuda_array_interface(dtype): np_data = np.arange(10).astype(dtype) cupy_data = cupy.array(np_data) @@ -3707,7 +3707,7 @@ def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): assert result._data.names == tuple(result._data.keys()) -@pytest.mark.parametrize("dtype", dtypes + ["category"]) +@pytest.mark.parametrize("dtype", [*dtypes, "category"]) def test_dataframe_0_row_dtype(dtype): if dtype == "category": data = pd.Series(["a", "b", "c", "d", "e"], dtype="category") @@ -7910,10 +7910,10 @@ def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): with _hide_concat_empty_dtype_warning(): expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index + [pdf, *other_pd], sort=sort, ignore_index=ignore_index ) actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index + [gdf, *other_gd], sort=sort, ignore_index=ignore_index ) # In some cases, Pandas creates an empty Index([], dtype="object") for @@ -8026,10 +8026,10 @@ def test_dataframe_concat_lists(df, other, sort, ignore_index): with _hide_concat_empty_dtype_warning(): expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index + [pdf, *other_pd], sort=sort, ignore_index=ignore_index ) actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index + [gdf, *other_gd], sort=sort, ignore_index=ignore_index ) if expected.shape != df.shape: @@ -10892,7 +10892,7 @@ def test_dataframe_from_ndarray_dup_columns(): @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) def test_dataframe_contains(name, contains, other_names): - column_names = [name] + other_names + column_names = [name, *other_names] gdf = cudf.DataFrame({c: [0] for c in column_names}) pdf = pd.DataFrame({c: [0] for c in column_names}) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index f93bd2c5d32..6a9dd4c4a66 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -16,7 +16,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): rng = np.random.default_rng(seed=0) - types = NUMERIC_TYPES + ["bool"] + types = [*NUMERIC_TYPES, "bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e4422e204bc..eae0fd23ef8 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -917,7 +917,6 @@ def test_groupby_apply_return_col_from_df(): # tests a UDF that consists of purely colwise # ops, such as `lambda group: group.x + group.y` # which returns a column - func = lambda group: group.x + group.y # noqa:E731 df = cudf.DataFrame( { "id": range(10), @@ -1222,7 +1221,7 @@ def test_groupby_column_numeral(): pd.Series([0, 2, 0]), pd.Series([0, 2, 0], index=[0, 2, 1]), ], -) # noqa: E501 +) def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) @@ -2016,8 +2015,8 @@ def test_multi_agg(): @pytest.mark.parametrize( "agg", ( - list(itertools.combinations(["count", "max", "min", "nunique"], 2)) - + [ + [ + *itertools.combinations(["count", "max", "min", "nunique"], 2), {"b": "min", "c": "mean"}, {"b": "max", "c": "mean"}, {"b": "count", "c": "mean"}, diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index 430ed973f19..4921b7b51fc 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -16,7 +16,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): - types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set( + types = set([*NUMERIC_TYPES, "datetime64[ns]", "bool"]) - set( UNSIGNED_TYPES ) typer = {"col_" + val: val for val in types} diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 24d42d9eb4c..11f6d687931 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1274,7 +1274,7 @@ def test_index_append_list(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) @pytest.mark.parametrize("name", [1, "a", None]) def test_index_basic(data, dtype, name): @@ -1399,7 +1399,7 @@ def test_multiindex_append(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_empty(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1410,7 +1410,7 @@ def test_index_empty(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_size(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1421,7 +1421,7 @@ def test_index_size(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_drop_duplicates(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1437,7 +1437,7 @@ def test_dropna_bad_how(): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_tolist(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1455,7 +1455,7 @@ def test_index_tolist(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_iter_error(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1473,7 +1473,7 @@ def test_index_iter_error(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_values_host(data, dtype): gdi = cudf.Index(data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index f6941ce7fae..f8e61651f37 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1527,7 +1527,7 @@ def test_categorical_typecast_outer(): result = left.merge(right, how="outer", on="key") -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_inner_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) @@ -1538,7 +1538,7 @@ def test_categorical_typecast_inner_one_cat(dtype): assert result["key"].dtype == left["key"].dtype.categories.dtype -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_left_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) @@ -1549,7 +1549,7 @@ def test_categorical_typecast_left_one_cat(dtype): assert result["key"].dtype == left["key"].dtype -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_outer_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index b48be6b2c2f..aaa8d7d07ee 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -58,12 +58,14 @@ def gdf(pdf): @pytest.fixture(params=[0, 1, 10, 100]) def gdf_writer_types(request): # datetime64[us], datetime64[ns] are unsupported due to a bug in parser - types = ( - NUMERIC_TYPES - + ["datetime64[s]", "datetime64[ms]"] - + TIMEDELTA_TYPES - + ["bool", "str"] - ) + types = [ + *NUMERIC_TYPES, + "datetime64[s]", + "datetime64[ms]", + *TIMEDELTA_TYPES, + "bool", + "str", + ] typer = {"col_" + val: val for val in types} ncols = len(types) nrows = request.param diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 41c1c3ccb20..c4b4ef60184 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -606,7 +606,7 @@ def normalized_equals(value1, value2): def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - supported_stat_types = supported_numpy_dtypes + ["str"] + supported_stat_types = [*supported_numpy_dtypes, "str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed if nrows == 100000: @@ -681,7 +681,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - supported_stat_types = supported_numpy_dtypes + ["str"] + supported_stat_types = [*supported_numpy_dtypes, "str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed if nrows == 200000: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 659d2ebd89a..de3636f7526 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2313,7 +2313,7 @@ def test_parquet_writer_criteo(tmpdir): cont_names = ["I" + str(x) for x in range(1, 14)] cat_names = ["C" + str(x) for x in range(1, 27)] - cols = ["label"] + cont_names + cat_names + cols = ["label", *cont_names, *cat_names] df = cudf.read_csv(fname, sep="\t", names=cols, byte_range=(0, 1000000000)) df = df.drop(columns=cont_names) diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 7d8303df0c3..9a2816f5444 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -90,4 +90,4 @@ def test_quantile_type_int_float(interpolation): actual = gsr.quantile(0.5, interpolation=interpolation) assert expected == actual - assert type(expected) == type(actual) + assert type(expected) is type(actual) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d9f4ceaf3f7..8ea0d205e8b 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -31,7 +31,7 @@ [ cudf.Series([5, 1, 2, 3, None, 243, None, 4]), cudf.Series(["one", "two", "three", None, "one"], dtype="category"), - cudf.Series(list(range(400)) + [None]), + cudf.Series([*list(range(400)), None]), ], ) @pytest.mark.parametrize( @@ -128,7 +128,7 @@ def test_series_replace(): assert_eq(a8, sr8.to_numpy()) # large input containing null - sr9 = cudf.Series(list(range(400)) + [None]) + sr9 = cudf.Series([*list(range(400)), None]) sr10 = sr9.replace([22, 323, 27, 0], None) assert sr10.null_count == 5 assert len(sr10.dropna().to_numpy()) == (401 - 5) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 53fe5f7f30d..5cebdf37c9f 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -28,9 +28,9 @@ # If spilling is enabled globally, we skip many test permutations # to reduce running time. if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 + ALL_TYPES = ["float32"] + DATETIME_TYPES = ["datetime64[ms]"] + NUMERIC_TYPES = ["float32"] # To save time, we skip tests marked "pytest.mark.xfail" pytest_xfail = pytest.mark.skipif diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index f2faf4343b6..fcd98831686 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -351,7 +351,7 @@ def test_scalar_implicit_float_conversion(value): got = float(cudf.Scalar(value)) assert expect == got - assert type(expect) == type(got) + assert type(expect) is type(got) @pytest.mark.parametrize("value", [1, -1, 1.5, 0, "1", True, False]) @@ -360,7 +360,7 @@ def test_scalar_implicit_int_conversion(value): got = int(cudf.Scalar(value)) assert expect == got - assert type(expect) == type(got) + assert type(expect) is type(got) @pytest.mark.parametrize("cls", [int, float, bool]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a040d1dc57f..99bd9adb034 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -282,8 +282,8 @@ def test_series_concat_list_series_with_index(data, others, ignore_index): other_ps = others other_gs = [cudf.from_pandas(obj) for obj in others] - expected = pd.concat([psr] + other_ps, ignore_index=ignore_index) - actual = cudf.concat([gsr] + other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, *other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, *other_gs], ignore_index=ignore_index) assert_eq(expected, actual) @@ -1942,7 +1942,7 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("num_rows", [1, 100]) @pytest.mark.parametrize("num_bins", [1, 10]) @pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): rng = np.random.default_rng(seed=0) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 5406836ba61..6119fda0752 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -472,7 +472,7 @@ def test_loc_setitem_series_index_alignment_13031(other_index): ), ], ) -@pytest.mark.parametrize("arg", list(range(-20, 20)) + [5.6, 3.1]) +@pytest.mark.parametrize("arg", [*list(range(-20, 20)), 5.6, 3.1]) def test_series_set_item_range_index(ps, arg): gsr = cudf.from_pandas(ps) psr = ps.copy(deep=True) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 7af83a99d60..13d98e43ddc 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -669,7 +669,7 @@ def test_statistics_expose(manager: SpillManager): # Expose the first buffer buffers[0].owner.mark_exposed() assert len(manager.statistics.exposes) == 1 - stat = list(manager.statistics.exposes.values())[0] + stat = next(iter(manager.statistics.exposes.values())) assert stat.count == 1 assert stat.total_nbytes == buffers[0].nbytes assert stat.spilled_nbytes == 0 diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 9700f548a16..bdc9e695844 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -536,8 +536,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): assert_eq(expect, got) - expect = ps.str.cat(others=[ps.index] + [ps.index], sep=sep, na_rep=na_rep) - got = gs.str.cat(others=[gs.index] + [gs.index], sep=sep, na_rep=na_rep) + expect = ps.str.cat(others=[ps.index, ps.index], sep=sep, na_rep=na_rep) + got = gs.str.cat(others=[gs.index, gs.index], sep=sep, na_rep=na_rep) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index c3620db3880..87734ebed58 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -69,7 +69,7 @@ def test_basic_assert_index_equal( msg = str(e) if kind is not None: - if (kind == TypeError) and ( + if (kind is TypeError) and ( msg == ( "Categoricals can only be compared " diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 47e541fdcef..3637ef075f2 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -44,7 +44,7 @@ def test_tokenize(): actual = strings.str.tokenize() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -71,7 +71,7 @@ def test_tokenize_delimiter(): actual = strings.str.tokenize(delimiter="o") - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -106,7 +106,7 @@ def test_detokenize(): "the siamésé cat jumped under the sofa", ] ) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) indices = cudf.Series( @@ -122,7 +122,7 @@ def test_detokenize(): "the+the+the+the", ] ) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -150,7 +150,7 @@ def test_token_count(delimiter, expected_token_counts): actual = strings.str.token_count(delimiter) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual, check_dtype=False) @@ -208,7 +208,7 @@ def test_tokenize_with_vocabulary(delimiter, input, default_id, results): ) actual = tokenizer.tokenize(strings, delimiter, default_id) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -232,7 +232,7 @@ def test_normalize_spaces(): actual = strings.str.normalize_spaces() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -252,7 +252,7 @@ def test_normalize_characters(): ) actual = strings.str.normalize_characters() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( @@ -266,7 +266,7 @@ def test_normalize_characters(): ] ) actual = strings.str.normalize_characters(do_lower=False) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -309,7 +309,7 @@ def test_ngrams(n, separator, expected_values): actual = strings.str.ngrams(n=n, separator=separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -364,7 +364,7 @@ def test_character_ngrams(n, expected_values, expected_index, as_list): actual = strings.str.character_ngrams(n=n, as_list=as_list) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -379,12 +379,12 @@ def test_hash_character_ngrams(): ] ) actual = strings.str.hash_character_ngrams(5, True) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) actual = strings.str.hash_character_ngrams(5) expected = expected.explode() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -417,7 +417,7 @@ def test_ngrams_tokenize(n, separator, expected_values): actual = strings.str.ngrams_tokenize(n=n, separator=separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -844,7 +844,7 @@ def test_porter_stemmer_measure(): actual = strings.str.porter_stemmer_measure() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -856,14 +856,14 @@ def test_is_vowel_consonant(): [False, False, True, False, False, False, True, False, None, False] ) actual = strings.str.is_vowel(2) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( [True, False, True, False, False, False, True, True, None, False] ) actual = strings.str.is_consonant(1) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0]) @@ -871,14 +871,14 @@ def test_is_vowel_consonant(): [False, True, False, False, True, False, True, True, None, False] ) actual = strings.str.is_vowel(indices) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( [False, False, True, True, False, True, False, False, None, False] ) actual = strings.str.is_consonant(indices) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -1097,5 +1097,5 @@ def test_byte_pair_encoding(separator, input, results): expected = cudf.Series([results, None, "", results]) actual = encoder(strings, separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 252bb19063a..5681601d2be 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -829,7 +829,7 @@ >>> cudf.read_json(json_str, engine='cudf', lines=True, dtype={'k1':float, 'k2':cudf.ListDtype(int)}) k1 k2 0 1.0 [1] -""" # noqa: E501 +""" doc_read_json: Callable = docfmt_partial(docstring=_docstring_read_json) _docstring_to_json = """ diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 78aeac425f7..8966789fee8 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -64,7 +64,7 @@ def query_parser(text): Returns ------- info: a `dict` of the parsed info - """ # noqa + """ # convert any '@' to text = text.replace("@", ENVREF_PREFIX) tree = ast.parse(text) @@ -249,7 +249,7 @@ def query_execute(df, expr, callenv): nrows = len(df) out = column_empty(nrows, dtype=np.bool_) # run kernel - args = [out] + colarrays + envargs + args = [out, *colarrays, *envargs] with _CUDFNumbaConfig(): kernel.forall(nrows)(*args) out_mask = applyutils.make_aggregate_nullmask(df, columns=columns) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index e6d252b8807..c83c1cbe895 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -210,7 +210,7 @@ class GetAttrGetItemMixin: # Tracking of protected keys by each subclass is necessary to make the # `__getattr__`->`__getitem__` call safe. See - # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # for an explanation. In brief, defining the `_PROTECTED_KEYS` allows this # class to avoid calling `__getitem__` inside `__getattr__` when # `__getitem__` will internally again call `__getattr__`, resulting in an diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 4473a0e6f12..d494e157a18 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1589,8 +1589,8 @@ def test_numpy_cupy_flatiter(series): _, s = series arr = s.values - assert type(arr.flat._fsproxy_fast) == cp.flatiter - assert type(arr.flat._fsproxy_slow) == np.flatiter + assert type(arr.flat._fsproxy_fast) is cp.flatiter + assert type(arr.flat._fsproxy_slow) is np.flatiter @pytest.mark.xfail( diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 665b9d6fb08..1909392b9f7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -15,7 +15,7 @@ def assert_plots_equal(expect, got): for expect_ch, got_ch in zip( expect.get_children(), got.get_children() ): - assert type(expect_ch) == type(got_ch) + assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): assert_equal(expect_ch.get_xdata(), got_ch.get_xdata()) assert_equal(expect_ch.get_ydata(), got_ch.get_ydata()) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py index 27d9df83476..2a0f6697f3a 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py @@ -8,7 +8,7 @@ def assert_plotly_equal(expect, got): - assert type(expect) == type(got) + assert type(expect) is type(got) if isinstance(expect, dict): assert expect.keys() == got.keys() for k in expect.keys(): diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 4b272900acd..021c5bac9b7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -14,7 +14,7 @@ def assert_plots_equal(expect, got): for expect_ch, got_ch in zip( expect.get_children(), got.get_children() ): - assert type(expect_ch) == type(got_ch) + assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): assert_equal(expect_ch.get_xdata(), got_ch.get_xdata()) assert_equal(expect_ch.get_ydata(), got_ch.get_ydata()) diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index ba4858c5619..72e09b872d5 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -21,8 +21,8 @@ del _ensure_polars_version __all__: list[str] = [ - "execute_with_cudf", "Translator", "__git_commit__", "__version__", + "execute_with_cudf", ] diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 3b1eff4a0d0..9dff8822376 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column"] +__all__: list[str] = ["Column", "DataFrame"] from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 326d6b65cbe..98d49e36fb1 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -36,27 +36,27 @@ from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ - "Expr", + "Agg", + "AggInfo", + "BinOp", + "BooleanFunction", + "Cast", + "Col", + "ColRef", "ErrorExpr", - "NamedExpr", + "Expr", + "Filter", + "Gather", + "GroupedRollingWindow", + "Len", "Literal", "LiteralColumn", - "Len", - "Col", - "ColRef", - "BooleanFunction", - "StringFunction", - "TemporalFunction", + "NamedExpr", + "RollingWindow", "Sort", "SortBy", - "Gather", - "Filter", - "RollingWindow", - "GroupedRollingWindow", - "Cast", - "Agg", - "AggInfo", + "StringFunction", + "TemporalFunction", "Ternary", - "BinOp", "UnaryFunction", ] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 2af9fdaacc5..624a9bd87ea 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -31,7 +31,7 @@ class Agg(Expr): - __slots__ = ("name", "options", "op", "request") + __slots__ = ("name", "op", "options", "request") _non_child = ("dtype", "name", "options") def __init__( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 23851f91938..4c7ae007070 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -20,7 +20,7 @@ from cudf_polars.containers import Column, DataFrame -__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] +__all__ = ["AggInfo", "Col", "ColRef", "ExecutionContext", "Expr", "NamedExpr"] class AggInfo(NamedTuple): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 1682e7a8a9c..5aa35ead127 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -195,7 +195,7 @@ def do_evaluate( # If the input null count was non-zero, we must # post-process the result to insert the correct value. h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: + if (is_any and not h_result) or (not is_any and h_result): # Any All # False || Null => Null True && Null => Null return Column(plc.Column.all_null_like(column.obj, 1)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py index fa68bcb9426..48c37d101f4 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: import pylibcudf as plc -__all__ = ["RollingWindow", "GroupedRollingWindow"] +__all__ = ["GroupedRollingWindow", "RollingWindow"] class RollingWindow(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index 77d7d4c0d22..12326740f74 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -20,7 +20,7 @@ from cudf_polars.containers import DataFrame -__all__ = ["Gather", "Filter"] +__all__ = ["Filter", "Gather"] class Gather(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 92c3c658c21..124a6e8d71c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -92,7 +92,7 @@ def from_polars(cls, obj: pl_expr.StringFunction) -> Self: raise ValueError("StringFunction required") return getattr(cls, name) - __slots__ = ("name", "options", "_regex_program") + __slots__ = ("_regex_program", "name", "options") _non_child = ("dtype", "name", "options") def __init__( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 7999ec86068..10caaff6811 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -21,7 +21,7 @@ from cudf_polars.containers import DataFrame -__all__ = ["Cast", "UnaryFunction", "Len"] +__all__ = ["Cast", "Len", "UnaryFunction"] class Cast(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e8d9691f2a0..a28b4cf25b2 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -42,24 +42,24 @@ __all__ = [ "IR", - "ErrorNode", - "PythonScan", - "Scan", "Cache", - "DataFrameScan", - "Select", - "GroupBy", - "Join", "ConditionalJoin", - "HStack", + "DataFrameScan", "Distinct", - "Sort", - "Slice", + "ErrorNode", "Filter", - "Projection", + "GroupBy", + "HConcat", + "HStack", + "Join", "MapFunction", + "Projection", + "PythonScan", + "Scan", + "Select", + "Slice", + "Sort", "Union", - "HConcat", ] @@ -130,7 +130,7 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" - __slots__ = ("schema", "_non_child_args") + __slots__ = ("_non_child_args", "schema") # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("schema",) # Concrete classes should set this up with the arguments that will @@ -253,16 +253,16 @@ class Scan(IR): """Input from files.""" __slots__ = ( - "typ", - "reader_options", "cloud_options", "config_options", - "paths", - "with_columns", - "skip_rows", "n_rows", - "row_index", + "paths", "predicate", + "reader_options", + "row_index", + "skip_rows", + "typ", + "with_columns", ) _non_child = ( "schema", @@ -688,7 +688,7 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ - __slots__ = ("df", "projection", "predicate") + __slots__ = ("df", "predicate", "projection") _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" @@ -819,11 +819,11 @@ class GroupBy(IR): """Perform a groupby.""" __slots__ = ( + "agg_infos", "agg_requests", "keys", "maintain_order", "options", - "agg_infos", ) _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") keys: tuple[expr.NamedExpr, ...] @@ -993,7 +993,7 @@ def do_evaluate( class ConditionalJoin(IR): """A conditional inner join of two dataframes on a predicate.""" - __slots__ = ("predicate", "options", "ast_predicate") + __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr options: tuple @@ -1053,7 +1053,7 @@ def do_evaluate( class Join(IR): """A join of two dataframes.""" - __slots__ = ("left_on", "right_on", "options") + __slots__ = ("left_on", "options", "right_on") _non_child = ("schema", "left_on", "right_on", "options") left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" @@ -1337,7 +1337,7 @@ def do_evaluate( class Distinct(IR): """Produce a new dataframe with distinct rows.""" - __slots__ = ("keep", "subset", "zlice", "stable") + __slots__ = ("keep", "stable", "subset", "zlice") _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption """Which distinct value to keep.""" @@ -1424,7 +1424,7 @@ def do_evaluate( class Sort(IR): """Sort a dataframe.""" - __slots__ = ("by", "order", "null_order", "stable", "zlice") + __slots__ = ("by", "null_order", "order", "stable", "zlice") _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") by: tuple[expr.NamedExpr, ...] """Sort keys.""" @@ -1505,7 +1505,7 @@ def do_evaluate( class Slice(IR): """Slice a dataframe.""" - __slots__ = ("offset", "length") + __slots__ = ("length", "offset") _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py index be8338cb9a9..b3248dae93c 100644 --- a/python/cudf_polars/cudf_polars/dsl/traversal.py +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -16,10 +16,10 @@ __all__: list[str] = [ - "traversal", - "reuse_if_unchanged", - "make_recursive", "CachingVisitor", + "make_recursive", + "reuse_if_unchanged", + "traversal", ] diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 57c5fdaa7cf..52be130ab90 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -21,13 +21,13 @@ from cudf_polars.dsl import expr, ir, nodebase __all__: list[str] = [ - "PolarsIR", - "PolarsExpr", - "NodeTraverser", - "OptimizationArgs", - "GenericTransformer", "ExprTransformer", + "GenericTransformer", "IRTransformer", + "NodeTraverser", + "OptimizationArgs", + "PolarsExpr", + "PolarsIR", ] PolarsIR: TypeAlias = Union[ diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index e7ac72df609..6bb5d78c488 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -19,9 +19,9 @@ ) __all__ = [ - "from_polars", - "downcast_arrow_lists", "can_cast", + "downcast_arrow_lists", + "from_polars", "is_order_preserving_cast", ] import pylibcudf as plc @@ -75,11 +75,13 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: return ( ( from_ == to - or not has_empty - and ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + or ( + not has_empty + and ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) ) ) or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f050a7c568a..b781b13ec10 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -158,6 +158,7 @@ ignore = [ "ISC002", # multi-line-implicit-string-concatenation ] fixable = ["ALL"] +typing-modules = ["cudf_polars.typing"] [tool.ruff.lint.per-file-ignores] "**/tests/**/*.py" = ["D"] diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index cc17e71039a..20eb2404b77 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -3,15 +3,15 @@ import warnings from importlib import import_module -from dask import config import dask.dataframe as dd -from dask.dataframe import from_delayed # noqa: E402 +from dask import config +from dask.dataframe import from_delayed -import cudf # noqa: E402 +import cudf -from . import backends # noqa: E402, F401 -from ._version import __git_commit__, __version__ # noqa: E402, F401 -from .core import concat, from_cudf, DataFrame, Index, Series # noqa: F401 +from . import backends # noqa: F401 +from ._version import __git_commit__, __version__ # noqa: F401 +from .core import DataFrame, Index, Series, concat, from_cudf QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED @@ -56,17 +56,17 @@ def inner_func(*args, **kwargs): if QUERY_PLANNING_ON: + from . import io from ._expr.expr import _patch_dask_expr - from . import io # noqa: F401 groupby_agg = _deprecated_api("dask_cudf.groupby_agg") read_text = DataFrame.read_text _patch_dask_expr() else: + from . import io # noqa: F401 from ._legacy.groupby import groupby_agg # noqa: F401 from ._legacy.io import read_text # noqa: F401 - from . import io # noqa: F401 to_orc = _deprecated_api( @@ -78,10 +78,10 @@ def inner_func(*args, **kwargs): __all__ = [ "DataFrame", - "Series", "Index", - "from_cudf", + "Series", "concat", + "from_cudf", "from_delayed", ] diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 89c0d108743..2dc4031b876 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -213,8 +213,9 @@ def _create_array_collection_with_meta(expr): name = result._name meta = result._meta divisions = result.divisions - chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( - (d,) for d in meta.shape[1:] + chunks = ( + (np.nan,) * (len(divisions) - 1), + *tuple((d,) for d in meta.shape[1:]), ) if len(chunks) > 1: if isinstance(dsk, HighLevelGraph): @@ -224,11 +225,11 @@ def _create_array_collection_with_meta(expr): layer = dsk if isinstance(layer, Blockwise): layer.new_axes["j"] = chunks[1][0] - layer.output_indices = layer.output_indices + ("j",) + layer.output_indices = (*layer.output_indices, "j") else: suffix = (0,) * (len(chunks) - 1) for i in range(len(chunks[0])): - layer[(name, i) + suffix] = layer.pop((name, i)) + layer[(name, i, *suffix)] = layer.pop((name, i)) return da.Array(dsk, name=name, chunks=chunks, meta=meta) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 7d6d5c05cbe..5fd217209ec 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -10,7 +10,7 @@ # This module provides backward compatibility for legacy import patterns. if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( # noqa: E402 + from dask_cudf._expr.collection import ( DataFrame, Index, Series, @@ -19,7 +19,7 @@ from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 -concat = dd.concat # noqa: F401 +concat = dd.concat @_dask_cudf_performance_tracking diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 212951336c9..9bca33e414a 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api, QUERY_PLANNING_ON - -from . import csv, orc, json, parquet, text # noqa: F401 +from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from . import csv, json, orc, parquet, text # noqa: F401 read_csv = _deprecated_api( "dask_cudf.io.read_csv", new_api="dask_cudf.read_csv" diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ce9935c8b3c..ba6209c4820 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -40,7 +40,7 @@ def TaskList(*x): from dask_cudf import QUERY_PLANNING_ON, _deprecated_api # Dask-expr imports CudfEngine from this module -from dask_cudf._legacy.io.parquet import CudfEngine # noqa: F401 +from dask_cudf._legacy.io.parquet import CudfEngine if TYPE_CHECKING: from collections.abc import MutableMapping diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5130b804179..cda7e2d134d 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -489,7 +489,7 @@ def test_repartition_hash_staged(npartitions): ) # Make sure we are getting a dask_cudf dataframe - assert type(ddf_new) == type(ddf) + assert type(ddf_new) is type(ddf) # Check that the length was preserved assert len(ddf_new) == len(ddf) @@ -956,7 +956,7 @@ def func(x): # NOTE: The calculation here doesn't need to make sense. # We just need to make sure we get the right type back. - assert type(result) == type(expect) + assert type(result) is type(expect) @pytest.mark.parametrize("data", [[1, 2, 3], [1.1, 2.3, 4.5]]) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index fe57d4a4f00..d91b9defc1c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -44,7 +44,7 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): if not preserve_index and index is not None: df1.index.name = None - assert type(df1) == type(df2) + assert type(df1) is type(df2) assert_eq(df1, df2) # Check that preserve_index does not produce a RangeIndex diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 918290aa6fa..9bd3b506db0 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -58,7 +58,7 @@ def pdf(request): # deprecation check for "collect". @pytest.mark.parametrize( "aggregation", - sorted(tuple(set(OPTIMIZED_AGGS) - {list}) + ("collect",)), + sorted((*tuple(set(OPTIMIZED_AGGS) - {list}), "collect")), ) @pytest.mark.parametrize("series", [False, True]) def test_groupby_basic(series, aggregation, pdf): diff --git a/python/libcudf/libcudf/__init__.py b/python/libcudf/libcudf/__init__.py index 10c476cbe89..4077fa8fbf9 100644 --- a/python/libcudf/libcudf/__init__.py +++ b/python/libcudf/libcudf/__init__.py @@ -14,3 +14,5 @@ from libcudf._version import __git_commit__, __version__ from libcudf.load import load_library + +__all__ = ["__git_commit__", "__version__", "load_library"] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 62a2170f83e..8ea176a6b07 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -65,8 +65,8 @@ "aggregation", "binaryop", "column_factories", - "contiguous_split", "concatenate", + "contiguous_split", "copying", "datetime", "experimental", @@ -83,6 +83,7 @@ "lists", "merge", "null_mask", + "nvtext", "partitioning", "quantiles", "reduce", @@ -91,13 +92,12 @@ "rolling", "round", "search", + "sorting", "stream_compaction", "strings", - "sorting", "traits", "transform", "transpose", "types", "unary", - "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 4f125d3a733..d88a7d4b825 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -15,11 +15,11 @@ ) __all__ = [ + "byte_pair_encode", "edit_distance", "generate_ngrams", "jaccard", "minhash", - "byte_pair_encode", "ngrams_tokenize", "normalize", "replace", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 1cbaac57315..555ca2fb02c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -281,7 +281,7 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): new_tbl_dict = {} for i, (name, vals) in enumerate(tbl_dict.items()): str_vals = [str(val) for val in vals] - new_tbl_dict[str(i)] = [name] + str_vals + new_tbl_dict[str(i)] = [name, *str_vals] pa_table = pa.table(new_tbl_dict) assert_table_and_meta_eq( From 852338e71dae9833a53507bd4b1470798f0a5c4b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 2 Dec 2024 16:41:35 -0600 Subject: [PATCH 2/2] Update PyTorch to >=2.4.0 to get fix for CUDA array interface bug, and drop CUDA 11 PyTorch tests. (#17475) This PR updates our PyTorch lower bound to 2.4.0 to get the bugfix from https://github.com/pytorch/pytorch/pull/121458. Also, this PR drops CUDA 11 tests because conda-forge no longer produces CUDA 11 builds of PyTorch. This was causing a failure on Hopper GPUs because the last available CUDA 11 builds from conda-forge do not include sm90 support. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17475 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 --- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 7 ++----- .../cudf/cudf/tests/test_cuda_array_interface.py | 15 +++++---------- .../dependencies.yaml | 2 +- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 97c72ec8042..2be64b7cd70 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -80,7 +80,6 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -97,8 +96,6 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.15.2 -- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 84b58b6d7a4..6b5ca04c015 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -78,7 +78,7 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 +- pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich diff --git a/dependencies.yaml b/dependencies.yaml index 3976696a41c..259d41b59fe 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -885,12 +885,9 @@ dependencies: - output_types: conda matrices: - matrix: - arch: x86_64 + cuda: "12.*" packages: - # Currently, CUDA + aarch64 builds of pytorch do not exist on conda-forge. - - pytorch>=2.1.0 - # We only install these on x86_64 to avoid pulling pytorch as a - # dependency of transformers. + - pytorch>=2.4.0 - *tokenizers - *transformers - matrix: diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 381ca45de31..dcde0dab83d 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -187,7 +187,7 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): ), ) def test_cuda_array_interface_pytorch(): - torch = pytest.importorskip("torch", minversion="1.6.0") + torch = pytest.importorskip("torch", minversion="2.4.0") if not torch.cuda.is_available(): pytest.skip("need gpu version of pytorch to be installed") @@ -202,15 +202,10 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - # TODO: This test fails with PyTorch 2. It appears that PyTorch - # checks that the pointer is device-accessible even when the - # size is zero. See - # https://github.com/pytorch/pytorch/issues/98133 - # - # index = cudf.Index([], dtype="float64") - # tensor = torch.tensor(index) - # got = cudf.Index(tensor) - # assert_eq(got, index) + index = cudf.Index([], dtype="float64") + tensor = torch.tensor(index) + got = cudf.Index(tensor) + assert_eq(got, index) index = cudf.core.index.RangeIndex(start=0, stop=100) tensor = torch.tensor(index) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 6b317cc13fb..e726b7fdca1 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -213,7 +213,7 @@ dependencies: - output_types: conda packages: - numpy - - pytorch>=2.1.0 + - pytorch>=2.4.0 test_seaborn: common: - output_types: conda