rapidsai · rapids-bot · Dec 2, 2024 · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024
@@ -2,7 +2,7 @@
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
         exclude: |
@@ -17,11 +17,11 @@ repos:
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
   - repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.2
+    rev: v0.16.6
     hooks:
       - id: cython-lint
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.10.0'
+    rev: 'v1.13.0'
     hooks:
       - id: mypy
         additional_dependencies: [types-cachetools]
@@ -33,7 +33,7 @@ repos:
                "python/dask_cudf/dask_cudf"]
         pass_filenames: false
   - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.8.5
+    rev: 1.9.1
     hooks:
       - id: nbqa-isort
         # Use the cudf_kafka isort orderings in notebooks so that dask
@@ -52,7 +52,7 @@ repos:
             ^cpp/include/cudf_test/cxxopts.hpp
           )
   - repo: https://github.com/sirosen/texthooks
-    rev: 0.6.6
+    rev: 0.6.7
     hooks:
       - id: fix-smartquotes
         exclude: |
@@ -133,7 +133,7 @@ repos:
         pass_filenames: false
         verbose: true
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies: [tomli]
@@ -144,7 +144,7 @@ repos:
             ^CHANGELOG.md$
           )
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.8
+    rev: v0.8.0
     hooks:
       - id: ruff
         args: ["--fix"]

@@ -72,7 +72,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
   // - Generate labels for lhs and rhs child elements.
   // - Check existence for rows of the table {rhs_labels, rhs_child} in the table
   //   {lhs_labels, lhs_child}.
-  // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence reults
+  // - `reduce_by_key` using rhs_labels as keys and `logical_or` reduction on the existence results
   //   computed in the previous step.
 
   auto const lhs_child = lhs.get_sliced_child(stream);

@@ -18,12 +18,13 @@ exclude = [
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,unparseable,falsy,couldn,Couldn"
+ignore-words-list = "inout,unparseable,falsy,couldn,Couldn,thirdparty"
 builtin = "clear"
 quiet-level = 3
 
 [tool.ruff]
 line-length = 79
+target-version = "py310"
 
 [tool.ruff.lint]
 typing-modules = ["cudf._typing"]
@@ -94,17 +95,33 @@ select = [
     "UP035",
     # usage of legacy `np.random` function calls
     "NPY002",
+    # Ruff-specific rules
+    "RUF",
 ]
 ignore = [
     # whitespace before :
     "E203",
     # line-too-long (due to Copyright header)
     "E501",
+    # String contains ambiguous character
+    "RUF001",
+    # Parenthesize `a and b` expressions when chaining `and` and `or`
+    # together, to make the precedence clear
+    "RUF021",
+    # Mutable class attributes should be annotated with
+    # `typing.ClassVar`
+    "RUF012",
 ]
 fixable = ["ALL"]
 exclude = [
-    # TODO: Remove this in a follow-up where we fix __all__.
-    "__init__.py",
+    # TODO: https://github.com/rapidsai/cudf/issues/17461
+    "**/*.ipynb",
+]
+
+[tool.ruff.format]
+exclude = [
+    # TODO: https://github.com/rapidsai/cudf/issues/17461
+    "**/*.ipynb",
 ]
 
 [tool.ruff.lint.per-file-ignores]

@@ -42,9 +42,9 @@ def pytest_collection_modifyitems(session, config, items):
         items[:] = list(filter(is_pandas_compatible, items))
 
 else:
-    import cupy  # noqa: W0611, F401
+    import cupy  # noqa: F401
 
-    import cudf  # noqa: W0611, F401
+    import cudf  # noqa: F401
 
     def pytest_collection_modifyitems(session, config, items):
         pass

@@ -56,18 +56,16 @@
 # into the main repo.
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
-# Turn off isort until we upgrade to 5.8.0
-# https://github.com/pycqa/isort/issues/1594
-from config import (  # noqa: W0611, E402, F401
+from config import (
     NUM_COLS,
     NUM_ROWS,
-    collect_ignore,
-    cudf,  # noqa: W0611, E402, F401
-    pytest_collection_modifyitems,
-    pytest_sessionfinish,
-    pytest_sessionstart,
+    collect_ignore,  # noqa: F401
+    cudf,
+    pytest_collection_modifyitems,  # noqa: F401
+    pytest_sessionfinish,  # noqa: F401
+    pytest_sessionstart,  # noqa: F401
 )
-from utils import (  # noqa: E402
+from utils import (
     OrderedSet,
     collapse_fixtures,
     column_generators,

@@ -99,6 +99,7 @@
 
 
 __all__ = [
+    "NA",
     "BaseIndex",
     "CategoricalDtype",
     "CategoricalIndex",
@@ -114,7 +115,6 @@
     "IntervalIndex",
     "ListDtype",
     "MultiIndex",
-    "NA",
     "NaT",
     "RangeIndex",
     "Scalar",

@@ -95,7 +95,7 @@ def start(self):
                 else:
                     self._data_handler.set_rand_params(self.params)
                     kwargs = self._data_handler._current_params["test_kwargs"]
-                    logging.info(f"Parameters passed: {str(kwargs)}")
+                    logging.info(f"Parameters passed: {kwargs!s}")
                     self._target(file_name, **kwargs)
             except KeyboardInterrupt:
                 logging.info(

@@ -133,7 +133,7 @@ def memory_usage(self, deep=False):
         """
         raise NotImplementedError
 
-    def tolist(self):  # noqa: D102
+    def tolist(self):
         raise TypeError(
             "cuDF does not support conversion to host memory "
             "via the `tolist()` method. Consider using "
@@ -148,7 +148,7 @@ def name(self):
         raise NotImplementedError
 
     @property  # type: ignore
-    def ndim(self) -> int:  # noqa: D401
+    def ndim(self) -> int:
         """Number of dimensions of the underlying data, by definition 1."""
         return 1
 
@@ -265,7 +265,7 @@ def get_loc(self, key):
         slice(1, 3, None)
         >>> multi_index.get_loc(('b', 'e'))
         1
-        """  # noqa: E501
+        """
 
     def max(self):
         """The maximum value of the index."""
@@ -1473,7 +1473,7 @@ def _intersection(self, other, sort=None):
             ._data
         )
 
-        if sort is {None, True} and len(other):
+        if sort in {None, True} and len(other):
             return intersection_result.sort_values()
         return intersection_result
 

@@ -54,7 +54,7 @@ def get_rmm_memory_resource_stack(
     """
 
     if hasattr(mr, "upstream_mr"):
-        return [mr] + get_rmm_memory_resource_stack(mr.upstream_mr)
+        return [mr, *get_rmm_memory_resource_stack(mr.upstream_mr)]
     return [mr]
 
 
@@ -275,7 +275,7 @@ def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool:
         print(
             f"[WARNING] RMM allocation of {format_bytes(nbytes)} bytes "
             "failed, spill-on-demand couldn't find any device memory to "
-            f"spill:\n{repr(self)}\ntraceback:\n{get_traceback()}\n"
+            f"spill:\n{self!r}\ntraceback:\n{get_traceback()}\n"
             f"{self.statistics}"
         )
         return False  # Since we didn't find anything to spill, we give up

@@ -366,7 +366,7 @@ def __str__(self) -> str:
             f"<{self.__class__.__name__} size={format_bytes(self._size)} "
             f"spillable={self.spillable} exposed={self.exposed} "
             f"num-spill-locks={len(self._spill_locks)} "
-            f"ptr={ptr_info} owner={repr(self._owner)}>"
+            f"ptr={ptr_info} owner={self._owner!r}>"
         )
 
 

@@ -15,17 +15,19 @@
     deserialize_columns,
     serialize_columns,
 )
-from cudf.core.column.datetime import DatetimeColumn  # noqa: F401
-from cudf.core.column.datetime import DatetimeTZColumn  # noqa: F401
-from cudf.core.column.lists import ListColumn  # noqa: F401
-from cudf.core.column.numerical import NumericalColumn  # noqa: F401
-from cudf.core.column.string import StringColumn  # noqa: F401
-from cudf.core.column.struct import StructColumn  # noqa: F401
-from cudf.core.column.timedelta import TimeDeltaColumn  # noqa: F401
-from cudf.core.column.interval import IntervalColumn  # noqa: F401
-from cudf.core.column.decimal import (  # noqa: F401
+from cudf.core.column.datetime import (
+    DatetimeColumn,
+    DatetimeTZColumn,
+)
+from cudf.core.column.decimal import (
     Decimal32Column,
     Decimal64Column,
     Decimal128Column,
     DecimalBaseColumn,
 )
+from cudf.core.column.interval import IntervalColumn
+from cudf.core.column.lists import ListColumn
+from cudf.core.column.numerical import NumericalColumn
+from cudf.core.column.string import StringColumn
+from cudf.core.column.struct import StructColumn
+from cudf.core.column.timedelta import TimeDeltaColumn
@@ -888,7 +888,7 @@ def find_and_replace(
         if len(replacement_col) == replacement_col.null_count:
             replacement_col = replacement_col.astype(self.categories.dtype)
 
-        if type(to_replace_col) != type(replacement_col):
+        if type(to_replace_col) is not type(replacement_col):
             raise TypeError(
                 f"to_replace and value should be of same types,"
                 f"got to_replace dtype: {to_replace_col.dtype} and "

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1016,7 +1016,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
             # astype("interval") (the string only) should no-op
             result = self
         else:
-            was_object = dtype == object or dtype == np.dtype(object)
+            was_object = dtype is object or dtype == np.dtype(object)
             dtype = cudf.dtype(dtype)
             if self.dtype == dtype:
                 result = self
@@ -2117,7 +2117,7 @@ def as_column(
     }:
         if isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype)):
             dtype = dtype.to_pandas()
-        elif dtype == object:
+        elif dtype is object:
             # Unlike pandas, interpret object as "str" instead of "python object"
             dtype = "str"
         ser = pd.Series(arbitrary, dtype=dtype)

@@ -435,7 +435,7 @@ def _get_decimal_type(
     `op` for the given dtypes.
 
     For precision & scale calculations see : https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
-    """  # noqa: E501
+    """
 
     # This should at some point be hooked up to libcudf's
     # binary_operation_fixed_point_scale
@@ -506,8 +506,8 @@ def _get_decimal_type(
     # if we've reached this point, we cannot create a decimal type without
     # overflow; raise an informative error
     raise ValueError(
-        f"Performing {op} between columns of type {repr(lhs_dtype)} and "
-        f"{repr(rhs_dtype)} would result in overflow"
+        f"Performing {op} between columns of type {lhs_dtype!r} and "
+        f"{rhs_dtype!r} would result in overflow"
     )
 
 

@@ -226,7 +226,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             # If `other` is a Python integer and it is out-of-bounds
             # promotion could fail but we can trivially define the result
             # in terms of `notnull` or `NULL_NOT_EQUALS`.
-            if type(other) is int and self.dtype.kind in "iu":  # noqa: E721
+            if type(other) is int and self.dtype.kind in "iu":
                 truthiness = None
                 iinfo = np.iinfo(self.dtype)
                 if iinfo.min > other:

@@ -548,7 +548,7 @@ def join(
         2    <NA>
         3     c-d
         dtype: object
-        """  # noqa E501
+        """
         if sep is None:
             sep = ""
 
@@ -694,7 +694,7 @@ def extract(
 
             The `flags` parameter currently only supports re.DOTALL and
             re.MULTILINE.
-        """  # noqa W605
+        """
         if not _is_supported_regex_flags(flags):
             raise NotImplementedError(
                 "unsupported value for `flags` parameter"
@@ -830,7 +830,7 @@ def contains(
             value is set.
             The `flags` parameter currently only supports re.DOTALL and
             re.MULTILINE.
-        """  # noqa W605
+        """
         if na is not np.nan:
             raise NotImplementedError("`na` parameter is not yet supported")
         if regex and isinstance(pat, re.Pattern):
@@ -3675,7 +3675,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex:
             -   Some characters need to be escaped when passing
                 in pat. e.g. ``'$'`` has a special meaning in regex
                 and must be escaped when finding this literal character.
-        """  # noqa W605
+        """
         if isinstance(pat, re.Pattern):
             flags = pat.flags & ~re.U
             pat = pat.pattern
@@ -6160,7 +6160,7 @@ def find_and_replace(
         to_replace_col = column.as_column(to_replace)
         replacement_col = column.as_column(replacement)
 
-        if type(to_replace_col) != type(replacement_col):
+        if type(to_replace_col) is not type(replacement_col):
             raise TypeError(
                 f"to_replace and value should be of same types,"
                 f"got to_replace dtype: {to_replace_col.dtype} and "

@@ -468,7 +468,7 @@ def components(self) -> dict[str, ColumnBase]:
         2  13000     10       12       48           712             0            0
         3      0      0       35       35           656             0            0
         4     37     13       12       14           234             0            0
-        """  # noqa: E501
+        """
 
         date_meta = {
             "seconds": ["m", "s"],