Consolidate linter configs into pyproject.toml (#12834)

This consolidation allows us to get rid of the now-unnecessary setup.cfg files (made possible by the removal of versioneer in #12741). It also allows us to move towards a fully pyproject.toml-driven build.

Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- AJ Schmidt (https://github.com/ajschmidt8)
- David Wendt (https://github.com/davidwendt)
- Lawrence Mitchell (https://github.com/wence-)

URL: #12834
rapidsai · Feb 24, 2023 · 77c2e03 · 77c2e03
1 parent 12e4501
commit 77c2e03
Show file tree

Hide file tree

Showing 17 changed files with 261 additions and 212 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,24 @@
+# Copyright (c) 2017-2023, NVIDIA CORPORATION.
+
+[flake8]
+filename = *.py, *.pyx, *.pxd, *.pxi
+exclude = __init__.py, *.egg, build, docs, .git
+force-check = True
+ignore =
+    # line break before binary operator
+    W503,
+    # whitespace before :
+    E203
+per-file-ignores =
+    # Rules ignored only in Cython:
+    # E211: whitespace before '(' (used in multi-line imports)
+    # E225: Missing whitespace around operators (breaks cython casting syntax like <int>)
+    # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
+    # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
+    # E275: Missing whitespace after keyword (Doesn't work with Cython except?)
+    # E402: module level import not at top of file (works for Python, not Cython)
+    # E999: invalid syntax (works for Python, not Cython)
+    # W504: line break after binary operator (breaks lines that end with a pointer)
+    *.pyx: E211, E225, E226, E227, E275, E402, E999, W504
+    *.pxd: E211, E225, E226, E227, E275, E402, E999, W504
+    *.pxi: E211, E225, E226, E227, E275, E402, E999, W504
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -34,7 +34,7 @@ repos:
         rev: 5.0.4
         hooks:
               - id: flake8
-                args: ["--config=setup.cfg"]
+                args: ["--config=.flake8"]
                 files: python/.*$
                 types: [file]
                 types_or: [python, cython]
@@ -48,7 +48,7 @@ repos:
         hooks:
               - id: mypy
                 additional_dependencies: [types-cachetools]
-                args: ["--config-file=setup.cfg",
+                args: ["--config-file=pyproject.toml",
                        "python/cudf/cudf",
                        "python/custreamz/custreamz",
                        "python/cudf_kafka/cudf_kafka",
@@ -58,7 +58,9 @@ repos:
         rev: 6.1.1
         hooks:
               - id: pydocstyle
-                args: ["--config=setup.cfg"]
+                # https://github.com/PyCQA/pydocstyle/issues/603
+                additional_dependencies: [toml]
+                args: ["--config=pyproject.toml"]
       - repo: https://github.com/pre-commit/mirrors-clang-format
         rev: v11.1.0
         hooks:
@@ -138,9 +140,11 @@ repos:
                 pass_filenames: false
                 verbose: false
       - repo: https://github.com/codespell-project/codespell
-        rev: v2.1.0
+        rev: v2.2.2
         hooks:
               - id: codespell
+                additional_dependencies: [tomli]
+                args: ["--toml", "pyproject.toml"]
                 exclude: |
                   (?x)^(
                     .*test.*|

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
@@ -94,7 +94,7 @@ sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/setup
 sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/setup.py
 
 # Dependency versions in pyproject.toml
-sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml
+sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml
 
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -501,7 +501,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
   rmm::device_uvector<cudf::size_type> offsets(num_rows + 1, cudf::get_default_stream());
   thrust::exclusive_scan(
     thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin());
-  // offfsets are ready.
+  // offsets are ready.
   auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1);
   rmm::device_uvector<char> chars(chars_length, cudf::get_default_stream());
   thrust::for_each_n(thrust::device,

diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -373,13 +373,13 @@ class data_profile {
 
   void set_bool_probability_true(double p)
   {
-    CUDF_EXPECTS(p >= 0. and p <= 1., "probablity must be in range [0...1]");
+    CUDF_EXPECTS(p >= 0. and p <= 1., "probability must be in range [0...1]");
     bool_probability_true = p;
   }
   void set_null_probability(std::optional<double> p)
   {
     CUDF_EXPECTS(p.value_or(0.) >= 0. and p.value_or(0.) <= 1.,
-                 "probablity must be in range [0...1]");
+                 "probability must be in range [0...1]");
     null_probability = p;
   }
   void set_cardinality(cudf::size_type c) { cardinality = c; }

diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md
@@ -22,16 +22,16 @@ Specifically, cuDF uses the following tools:
   In conjunction with [type hints](https://docs.python.org/3/library/typing.html),
   `mypy` can help catch various bugs that are otherwise difficult to find.
 - [`pydocstyle`](https://github.com/PyCQA/pydocstyle/) lints docstring style.
+- [`codespell`](https://github.com/codespell-project/codespell) finds spelling errors.
 
 Linter config data is stored in a number of files.
-We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `setup.cfg` > `python/cudf/setup.cfg`).
+We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `pyproject.toml` > `python/cudf/pyproject.toml`).
 However, differences between tools and the different packages in the repo result in the following caveats:
 
-- `flake8` has no plans to support `pyproject.toml`, so it must live in `setup.cfg`.
+- `flake8` has no plans to support `pyproject.toml`, so its configuration must live in `.flake8`.
 - `isort` must be configured per project to set which project is the "first party" project.
 
-Additionally, our use of `versioneer` means that each project must have a `setup.cfg`.
-As a result, we currently maintain both root and project-level `pyproject.toml` and `setup.cfg` files.
+As a result, we currently maintain both root and project-level `pyproject.toml` files as well as a `.flake8` file.
 
 For more information on how to use pre-commit hooks, see the code formatting section of the
 [overall contributing guide](https://github.com/rapidsai/cudf/blob/main/CONTRIBUTING.md#python--pre-commit-hooks).

diff --git a/pyproject.toml b/pyproject.toml
@@ -17,3 +17,41 @@ force-exclude = '''
     dist
 )/
 '''
+
+[tool.pydocstyle]
+# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
+# than include using match-dir. Note that as discussed in
+# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle,
+# unlike the match option, this match-dir will have no effect when
+# pydocstyle is invoked from pre-commit. Therefore this exclusion list must
+# also be maintained in the pre-commit config file.
+match-dir = "^(?!(ci|cpp|conda|docs|java|notebooks)).*$"
+# Allow missing docstrings for docutils
+ignore-decorators = ".*(docutils|doc_apply|copy_docstring).*"
+select = "D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418"
+    # Would like to enable the following rules in the future:
+    # D200, D202, D205, D400
+
+[tool.mypy]
+ignore_missing_imports = true
+# If we don't specify this, then mypy will check excluded files if
+# they are imported by a checked file.
+follow_imports = "skip"
+exclude = [
+    "cudf/_lib/",
+    "cudf/cudf/benchmarks/",
+    "cudf/cudf/tests/",
+    "cudf/cudf/utils/metadata/orc_column_statistics_pb2.py",
+    "custreamz/custreamz/tests/",
+    "dask_cudf/dask_cudf/tests/",
+ ]
+
+[tool.codespell]
+# Note: pre-commit passes explicit lists of files to codespell, which this skip list does not override;
+# this skip list exists only so that you can run codespell interactively.
+skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp"
+# ignore short words, and typename parameters like OffsetT
+ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
+ignore-words-list = "inout,unparseable,falsy"
+builtin = "clear"
+quiet-level = 3
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 import numpy as np
 import pyarrow as pa
@@ -315,7 +315,7 @@ cdef columns_from_table_view(
     object owners,
 ):
     """
-    Given a ``cudf::table_view``, construsts a list of columns from it,
+    Given a ``cudf::table_view``, constructs a list of columns from it,
     along with referencing an owner Python object that owns the memory
     lifetime. owner must be either None or a list of column. If owner
     is a list of columns, the owner of the `i`th ``cudf::column_view``

diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
@@ -15,3 +15,46 @@ requires = [
     "protoc-wheel",
     "rmm==23.4.*",
 ]
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_dask = [
+    "dask",
+    "distributed",
+    "dask_cuda",
+]
+known_rapids = [
+    "rmm",
+]
+known_first_party = [
+    "cudf",
+]
+default_section = "THIRDPARTY"
+sections = [
+    "FUTURE",
+    "STDLIB",
+    "THIRDPARTY",
+    "DASK",
+    "RAPIDS",
+    "FIRSTPARTY",
+    "LOCALFOLDER",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
@@ -7,3 +7,49 @@ requires = [
     "setuptools",
     "cython>=0.29,<0.30",
 ]
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_dask = [
+    "dask",
+    "distributed",
+    "dask_cuda",
+    "streamz",
+]
+known_rapids = [
+    "rmm",
+    "cudf",
+    "dask_cudf",
+]
+known_first_party = [
+    "cudf_kafka",
+]
+default_section = "THIRDPARTY"
+sections = [
+    "FUTURE",
+    "STDLIB",
+    "THIRDPARTY",
+    "DASK",
+    "RAPIDS",
+    "FIRSTPARTY",
+    "LOCALFOLDER",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
diff --git a/python/cudf_kafka/setup.cfg b/python/cudf_kafka/setup.cfg