From e64e26eda09f8508b7760ddba9f742c4f4e827cb Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 23 Feb 2023 18:07:56 -0800 Subject: [PATCH 01/10] Expose seed argument to hash_values (#12795) This PR exposes the `seed` param to `hash_values` that is already supported by libcudf's `hash` method. Closes #12775 Authors: - Ayush Dattagupta (https://github.com/ayushdg) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/12795 --- python/cudf/cudf/core/indexed_frame.py | 24 +++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 39 +++++++++++++++++++++--- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 43277fb55ff..2992cb005e5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1629,7 +1629,7 @@ def memory_usage(self, index=True, deep=False): """ raise NotImplementedError - def hash_values(self, method="murmur3"): + def hash_values(self, method="murmur3", seed=None): """Compute the hash of values in this column. Parameters @@ -1639,6 +1639,12 @@ def hash_values(self, method="murmur3"): * murmur3: MurmurHash3 hash function. * md5: MD5 hash function. + seed : int, optional + Seed value to use for the hash function. + Note - This only has effect for the following supported + hash functions: + * murmur3: MurmurHash3 hash function. + Returns ------- Series @@ -1665,6 +1671,11 @@ def hash_values(self, method="murmur3"): 1 947ca8d2c5f0f27437f156cfbfab0969 2 d0580ef52d27c043c8e341fd5039b166 dtype: object + >>> series.hash_values(method="murmur3", seed=42) + 0 2364453205 + 1 422621911 + 2 3353449140 + dtype: uint32 **DataFrame** @@ -1686,11 +1697,20 @@ def hash_values(self, method="murmur3"): 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ + seed_hash_methods = {"murmur3"} + if seed is None: + seed = 0 + elif method not in seed_hash_methods: + warnings.warn( + "Provided seed value has no effect for hash method" + f" `{method}`. Refer to the docstring for information" + " on hash methods that support the `seed` param" + ) # Note that both Series and DataFrame return Series objects from this # calculation, necessitating the unfortunate circular reference to the # child class here. 
return cudf.Series._from_data( - {None: libcudf.hash.hash([*self._columns], method)}, + {None: libcudf.hash.hash([*self._columns], method, seed)}, index=self.index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 09b9f57356c..13f312f6f0c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -38,6 +38,7 @@ NUMERIC_TYPES, assert_eq, assert_exceptions_equal, + assert_neq, does_not_raise, expect_warning_if, gen_rand, @@ -1323,9 +1324,10 @@ def test_assign(): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize("method", ["murmur3", "md5"]) -def test_dataframe_hash_values(nrows, method): +@pytest.mark.parametrize("seed", [None, 42]) +def test_dataframe_hash_values(nrows, method, seed): gdf = cudf.DataFrame() - data = np.asarray(range(nrows)) + data = np.arange(nrows) data[0] = data[-1] # make first and last the same gdf["a"] = data gdf["b"] = gdf.a + 100 @@ -1334,12 +1336,41 @@ def test_dataframe_hash_values(nrows, method): assert len(out) == nrows assert out.dtype == np.uint32 + warning_expected = ( + True if seed is not None and method not in {"murmur3"} else False + ) # Check single column - out_one = gdf[["a"]].hash_values(method=method) + if warning_expected: + with pytest.warns( + UserWarning, match="Provided seed value has no effect*" + ): + out_one = gdf[["a"]].hash_values(method=method, seed=seed) + else: + out_one = gdf[["a"]].hash_values(method=method, seed=seed) # First matches last assert out_one.iloc[0] == out_one.iloc[-1] # Equivalent to the cudf.Series.hash_values() - assert_eq(gdf["a"].hash_values(method=method), out_one) + if warning_expected: + with pytest.warns( + UserWarning, match="Provided seed value has no effect*" + ): + assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) + else: + assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) + + +@pytest.mark.parametrize("method", ["murmur3"]) +def test_dataframe_hash_values_seed(method): + gdf = cudf.DataFrame() + data = np.arange(10) + data[0] = data[-1] # make first and last the same + gdf["a"] = data + gdf["b"] = gdf.a + 100 + out_one = gdf.hash_values(method=method, seed=0) + out_two = gdf.hash_values(method=method, seed=1) + assert out_one.iloc[0] == out_one.iloc[-1] + assert out_two.iloc[0] == out_two.iloc[-1] + assert_neq(out_one, out_two) @pytest.mark.parametrize("nrows", [3, 10, 100, 1000]) From 2e80eba6f75b03f039517c947f386ede65842a4c Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 24 Feb 2023 10:28:25 -0600 Subject: [PATCH 02/10] Fix parquet `RangeIndex` bug (#12838) Possible fix for https://github.com/rapidsai/cudf/issues/12837 Avoids dropping RangeIndex when `columns` argument is passed to `read_parquet` (unless `columns=[]`). 
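A minimal sketch of the behavior this fixes, mirroring the new test below (the
in-memory `BytesIO` round-trip is only for illustration):

```python
from io import BytesIO

import pandas as pd

import cudf

buffer = BytesIO()
df = cudf.DataFrame({"a": [1, 2, 3]}, index=pd.RangeIndex(2, 5))
df.to_parquet(buffer)

# Before this fix, passing `columns` silently replaced the RangeIndex with a
# default index; it is now preserved unless `columns=[]` is requested.
got = cudf.read_parquet(buffer, columns=["a"])
assert isinstance(got.index, cudf.RangeIndex)
assert (got.index.start, got.index.stop) == (2, 5)
```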
Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/12838
---
 python/cudf/cudf/_lib/parquet.pyx      |  2 +-
 python/cudf/cudf/tests/test_parquet.py | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index e5520ae1987..464d9243408 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -170,7 +170,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     allow_range_index = True
     if columns is not None:
         cpp_columns.reserve(len(columns))
-        allow_range_index = False
+        allow_range_index = len(columns) > 0
         for col in columns:
             cpp_columns.push_back(str(col).encode())
     args.set_columns(cpp_columns)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index ccd62729a9d..661497e4650 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2650,6 +2650,20 @@ def test_parquet_columns_and_index_param(index, columns):
     assert_eq(expected, got, check_index_type=True)
 
 
+@pytest.mark.parametrize("columns", [None, ["b", "a"]])
+def test_parquet_columns_and_range_index(columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame(
+        {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=pd.RangeIndex(2, 5)
+    )
+    df.to_parquet(buffer)
+
+    expected = pd.read_parquet(buffer, columns=columns)
+    got = cudf.read_parquet(buffer, columns=columns)
+
+    assert_eq(expected, got, check_index_type=True)
+
+
 def test_parquet_nested_struct_list():
     buffer = BytesIO()
     data = {

From 0e4e6dd567964404934d96a1fe8fc14b1d25a526 Mon Sep 17 00:00:00 2001
From: Divye Gala
Date: Fri, 24 Feb 2023 12:07:51 -0500
Subject: [PATCH 03/10] Add `always_nullable` flag to Dremel encoding (#12727)

Closes #12389 by fixing the bug described here
https://github.com/rapidsai/cudf/issues/12389#issuecomment-1419949751.

This flag, when `always_nullable=true`, generates `definition levels` in the
Dremel encoding such that it considers every nested column and child to be
`nullable`, even if they actually are not.

In the context of `two_table_comparators`, this helps us produce consistently
mapped `definition levels` in case there are some nested columns or children
that are not nullable in either one or both of the tables.

This PR now exposes two APIs:
1. `cudf::detail::get_dremel_data(...)` : This API is consistent with standard
Dremel encoding
2. 
`cudf::detail::get_comparator_data(...)` : This API modifies the definition levels in Dremel encoding to produce the effect described above Authors: - Divye Gala (https://github.com/divyegala) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12727 --- cpp/include/cudf/lists/detail/dremel.hpp | 30 +++++++-- .../cudf/table/experimental/row_operators.cuh | 3 +- cpp/src/lists/dremel.cu | 48 ++++++++++---- cpp/src/table/row_operators.cu | 2 +- cpp/tests/search/search_list_test.cpp | 64 ++++++++++++++++++- 5 files changed, 124 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 4e3aeec2499..d36a4091947 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -183,16 +183,34 @@ struct dremel_data { * - | - | -- | --- * ``` * - * @param col Column of LIST type - * @param level_nullability Pre-determined nullability at each list level. Empty means infer from - * `col` + * @param input Column of LIST type + * @param nullability Pre-determined nullability at each list level. Empty means infer from + * `input` + * @param output_as_byte_array if `true`, then any nested list level that has a child of type + * `uint8_t` will be considered as the last level * @param stream CUDA stream used for device memory operations and kernel launches. - * * @return A struct containing dremel data */ -dremel_data get_dremel_data(column_view h_col, +dremel_data get_dremel_data(column_view input, std::vector nullability, bool output_as_byte_array, rmm::cuda_stream_view stream); +/** + * @brief Get Dremel offsets, repetition levels, and modified definition levels to be used for + * lexicographical comparators. The modified definition levels are produced by treating + * each nested column in the input as nullable + * + * @param input Column of LIST type + * @param nullability Pre-determined nullability at each list level. Empty means infer from + * `input` + * @param output_as_byte_array if `true`, then any nested list level that has a child of type + * `uint8_t` will be considered as the last level + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return A struct containing dremel data + */ +dremel_data get_comparator_data(column_view input, + std::vector nullability, + bool output_as_byte_array, + rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index f9ffbfcdf7b..2a207d2a5c4 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -487,7 +487,8 @@ class device_row_comparator { // element_index because either both rows have a deeply nested NULL at the // same position, and we'll "continue" in our iteration, or we will early // exit if only one of the rows has a deeply nested NULL - if (lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) { + if ((lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) or + (rcol.nullable() and r_def_levels[r_dremel_index] == r_max_def_level - 1)) { ++element_index; } if (l_def_level == r_def_level) { continue; } diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 26988622aee..c96a21df905 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -35,7 +35,7 @@ #include namespace cudf::detail { - +namespace { /** * @brief Functor to get definition level value for a nested struct column until the leaf level or * the first list level. @@ -46,6 +46,7 @@ struct def_level_fn { uint8_t const* d_nullability; uint8_t sub_level_start; uint8_t curr_def_level; + bool always_nullable; __device__ uint32_t operator()(size_type i) { @@ -55,7 +56,7 @@ struct def_level_fn { auto col = *parent_col; do { // If col not nullable then it does not contribute to def levels - if (d_nullability[l]) { + if (always_nullable or d_nullability[l]) { if (not col.nullable() or bit_is_set(col.null_mask(), i)) { ++def; } else { // We have found the shallowest level at which this row is null @@ -72,10 +73,11 @@ struct def_level_fn { } }; -dremel_data get_dremel_data(column_view h_col, - std::vector nullability, - bool output_as_byte_array, - rmm::cuda_stream_view stream) +dremel_data get_encoding(column_view h_col, + std::vector nullability, + bool output_as_byte_array, + bool always_nullable, + rmm::cuda_stream_view stream) { auto get_list_level = [](column_view col) { while (col.type().id() == type_id::STRUCT) { @@ -173,14 +175,14 @@ dremel_data get_dremel_data(column_view h_col, uint32_t def = 0; start_at_sub_level.push_back(curr_nesting_level_idx); while (col.type().id() == type_id::STRUCT) { - def += (nullability[curr_nesting_level_idx]) ? 1 : 0; + def += (always_nullable or nullability[curr_nesting_level_idx]) ? 1 : 0; col = col.child(0); ++curr_nesting_level_idx; } // At the end of all those structs is either a list column or the leaf. List column contributes // at least one def level. Leaf contributes 1 level only if it is nullable. - def += - (col.type().id() == type_id::LIST ? 1 : 0) + (nullability[curr_nesting_level_idx] ? 1 : 0); + def += (col.type().id() == type_id::LIST ? 1 : 0) + + (always_nullable or nullability[curr_nesting_level_idx] ? 
1 : 0); def_at_level.push_back(def); ++curr_nesting_level_idx; }; @@ -209,7 +211,7 @@ dremel_data get_dremel_data(column_view h_col, } } - auto [device_view_owners, d_nesting_levels] = + [[maybe_unused]] auto [device_view_owners, d_nesting_levels] = contiguous_copy_column_device_views(nesting_levels, stream); auto max_def_level = def_at_level.back(); @@ -297,7 +299,8 @@ dremel_data get_dremel_data(column_view h_col, def_level_fn{d_nesting_levels + level, d_nullability.data(), start_at_sub_level[level], - def_at_level[level]}); + def_at_level[level], + always_nullable}); // `nesting_levels.size()` == no of list levels + leaf. Max repetition level = no of list levels auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size() - 1); @@ -306,7 +309,8 @@ dremel_data get_dremel_data(column_view h_col, def_level_fn{d_nesting_levels + level + 1, d_nullability.data(), start_at_sub_level[level + 1], - def_at_level[level + 1]}); + def_at_level[level + 1], + always_nullable}); // Zip the input and output value iterators so that merge operation is done only once auto input_parent_zip_it = @@ -389,7 +393,8 @@ dremel_data get_dremel_data(column_view h_col, def_level_fn{d_nesting_levels + level, d_nullability.data(), start_at_sub_level[level], - def_at_level[level]}); + def_at_level[level], + always_nullable}); // Zip the input and output value iterators so that merge operation is done only once auto input_parent_zip_it = @@ -459,5 +464,22 @@ dremel_data get_dremel_data(column_view h_col, leaf_data_size, max_def_level}; } +} // namespace + +dremel_data get_dremel_data(column_view h_col, + std::vector nullability, + bool output_as_byte_array, + rmm::cuda_stream_view stream) +{ + return get_encoding(h_col, nullability, output_as_byte_array, false, stream); +} + +dremel_data get_comparator_data(column_view h_col, + std::vector nullability, + bool output_as_byte_array, + rmm::cuda_stream_view stream) +{ + return get_encoding(h_col, nullability, output_as_byte_array, true, stream); +} } // namespace cudf::detail diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 766a1b63905..8a63a6f6411 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -264,7 +264,7 @@ auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) std::vector dremel_device_views; for (auto const& col : table) { if (col.type().id() == type_id::LIST) { - dremel_data.push_back(detail::get_dremel_data(col, {}, false, stream)); + dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); dremel_device_views.push_back(dremel_data.back()); } } diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 1393095037d..1e97933fa4d 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,7 +25,8 @@
 #include 
 #include 
 
-using namespace cudf::test::iterators;
+using cudf::test::iterators::null_at;
+using cudf::test::iterators::nulls_at;
 
 using bools_col  = cudf::test::fixed_width_column_wrapper<bool>;
 using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
@@ -347,3 +348,62 @@ TYPED_TEST(TypedListContainsTestColumnNeedles, ListsOfStructs)
   auto const result = cudf::contains(*haystack, *needles);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity);
 }
+
+struct ListLowerBound : public cudf::test::BaseFixture {
+};
+
+TEST_F(ListLowerBound, ListWithNulls)
+{
+  {
+    using lcw = cudf::test::lists_column_wrapper<double>;
+    auto const haystack = lcw{
+      lcw{-3.45967821e+12},  // 0
+      lcw{-3.6912186e-32},   // 1
+      lcw{9.721175},         // 2
+    };
+
+    auto const needles = lcw{
+      lcw{{0, 4.22671e+32}, null_at(0)},
+    };
+
+    auto const expect = int32s_col{0};
+    auto const result = cudf::lower_bound(cudf::table_view{{haystack}},
+                                          cudf::table_view{{needles}},
+                                          {cudf::order::ASCENDING},
+                                          {cudf::null_order::BEFORE});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result);
+  }
+
+  {
+    using lcw = cudf::test::lists_column_wrapper<int32_t>;
+    auto const col1 = lcw{
+      lcw{{0}, null_at(0)},  // 0
+      lcw{-80},              // 1
+      lcw{-17},              // 2
+    };
+
+    auto const col2 = lcw{
+      lcw{27},               // 0
+      lcw{{0}, null_at(0)},  // 1
+      lcw{},                 // 2
+    };
+
+    auto const val1 = lcw{
+      lcw{87},
+    };
+
+    auto const val2 = lcw{
+      lcw{},
+    };
+
+    cudf::table_view input{{col1, col2}};
+    cudf::table_view values{{val1, val2}};
+    std::vector<cudf::order> column_order{cudf::order::ASCENDING, cudf::order::DESCENDING};
+    std::vector<cudf::null_order> null_order_flags{cudf::null_order::BEFORE,
+                                                   cudf::null_order::BEFORE};
+
+    auto const expect = int32s_col{3};
+    auto const result = cudf::lower_bound(input, values, column_order, null_order_flags);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result);
+  }
+}

From 8a7fb2f14a73937d31f648a65f57bc47751e97c1 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Fri, 24 Feb 2023 12:25:49 -0600
Subject: [PATCH 04/10] Deprecate `inplace` parameters in categorical methods
 (#12824)

To get ready for pandas-2.0 compatibility, this PR deprecates `inplace` in the
following APIs:

- [x] `as_ordered`
- [x] `as_unordered`
- [x] `add_categories`
- [x] `remove_categories`
- [x] `set_categories`
- [x] `reorder_categories`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/12824
---
 python/cudf/cudf/core/column/categorical.py | 78 ++++++++++++++++++++-
 python/cudf/cudf/tests/test_categorical.py  | 19 +++--
 2 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index a1526d25512..52f7c0b957f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -141,6 +141,13 @@ def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]:
             or return a copy of this
             categorical with added categories.
 
+            .. deprecated:: 23.02
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Setting categories as ordered will always
+               return a new Categorical object.
+
         Returns
         -------
         Categorical
@@ -204,6 +211,13 @@ def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]:
             in-place or return a copy of this
             categorical with ordered set to False.
 
+            .. deprecated:: 23.02
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Setting categories as unordered will always
+               return a new Categorical object.
+
         Returns
         -------
         Categorical
@@ -286,6 +300,13 @@ def add_categories(
             or return a copy of this categorical with
             added categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Adding categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -318,7 +339,14 @@ def add_categories(
         dtype: category
         Categories (5, int64): [1, 2, 0, 3, 4]
         """
-
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in cudf.Series.cat.add_categories "
+                "is deprecated and will be removed in a future version of "
+                "cudf. Adding categories will always return a new "
+                "Categorical object.",
+                FutureWarning,
+            )
         old_categories = self._column.categories
         new_categories = column.as_column(
             new_categories,
@@ -371,6 +399,13 @@ def remove_categories(
             inplace or return a copy of this categorical
             with removed categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Removing categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -423,6 +458,16 @@ def remove_categories(
         dtype: category
         Categories (2, int64): [1, 2]
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in "
+                "cudf.Series.cat.remove_categories is deprecated and "
+                "will be removed in a future version of cudf. "
+                "Removing categories will always return a new "
+                "Categorical object.",
+                FutureWarning,
+            )
+
         cats = self.categories.to_series()
         removals = cudf.Series(removals, dtype=cats.dtype)
         removals_mask = removals.isin(cats)
@@ -485,6 +530,13 @@ def set_categories(
             or return a copy of this categorical with
             reordered categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Setting categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -524,6 +576,14 @@ def set_categories(
         dtype: category
         Categories (2, int64): [1, 10]
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in cudf.Series.cat.set_categories is "
+                "deprecated and will be removed in a future version of cudf. "
+                "Setting categories will always return a new Categorical "
+                "object.",
+                FutureWarning,
+            )
         return self._return_or_inplace(
             self._column.set_categories(
                 new_categories=new_categories, ordered=ordered, rename=rename
@@ -556,6 +616,13 @@ def reorder_categories(
             inplace or return a copy of this categorical
             with reordered categories.
 
+            .. deprecated:: 23.04
+
+               The `inplace` parameter is deprecated and
+               will be removed in a future version of cudf.
+               Reordering categories will always return a
+               new Categorical object.
+
         Returns
         -------
         cat
@@ -597,6 +664,15 @@ def reorder_categories(
         ValueError: items in new_categories are not the same as in
         old categories
         """
+        if inplace:
+            warnings.warn(
+                "The `inplace` parameter in "
+                "cudf.Series.cat.reorder_categories is deprecated "
+                "and will be removed in a future version of cudf. 
" + "Reordering categories will always return a new " + "Categorical object.", + FutureWarning, + ) return self._return_or_inplace( self._column.reorder_categories(new_categories, ordered=ordered), inplace=inplace, diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index fa8981cf7e3..496039ca2f8 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -443,10 +443,13 @@ def test_categorical_reorder_categories( "reorder_categories" ): pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) if inplace: + with pytest.warns(FutureWarning): + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) pd_sr_1 = pd_sr cd_sr_1 = cd_sr + else: + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) assert_eq(pd_sr_1, cd_sr_1) @@ -479,10 +482,14 @@ def test_categorical_add_categories(pd_str_cat, inplace): "add_categories" ): pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) - cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) + if inplace: + with pytest.warns(FutureWarning): + cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) pd_sr_1 = pd_sr cd_sr_1 = cd_sr + else: + cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) assert "d" in pd_sr_1.cat.categories.to_list() assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() @@ -516,10 +523,14 @@ def test_categorical_remove_categories(pd_str_cat, inplace): "remove_categories" ): pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) - cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) + if inplace: + with pytest.warns(FutureWarning): + cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) pd_sr_1 = pd_sr cd_sr_1 = cd_sr + else: + cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) assert "a" not in pd_sr_1.cat.categories.to_list() assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() @@ -529,7 +540,7 @@ def test_categorical_remove_categories(pd_str_cat, inplace): # test using ordered operators with _hide_deprecated_pandas_categorical_inplace_warnings( "remove_categories" - ): + ) as _, pytest.warns(FutureWarning) as _: assert_exceptions_equal( lfunc=cd_sr.to_pandas().cat.remove_categories, rfunc=cd_sr.cat.remove_categories, From 54ee14e36157fe63d0eb58ed7ac8bafc2b1e4932 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Fri, 24 Feb 2023 19:37:29 +0100 Subject: [PATCH 05/10] Update datasets download URL (#12840) Update datasets download URL to reduce latency and costs Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12840 --- python/cudf/cudf/benchmarks/get_datasets.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/benchmarks/get_datasets.py b/python/cudf/cudf/benchmarks/get_datasets.py index f3b66eda512..7090539bcb0 100644 --- a/python/cudf/cudf/benchmarks/get_datasets.py +++ b/python/cudf/cudf/benchmarks/get_datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
 import argparse
 import os
@@ -9,10 +9,7 @@
 Dataset = namedtuple("Dataset", ["url", "dir"])
 
 datasets = {
     "cuio_dataset": Dataset(
-        (
-            "https://rapidsai-data.s3.us-east-2.amazonaws.com/cudf/"
-            "benchmark/avro_json_datasets.zip"
-        ),
+        "https://data.rapids.ai/cudf/benchmark/avro_json_datasets.zip",
         "cudf/benchmarks/cuio_data/",
     ),
 }

From 12e4501c49daac3d0e3837a3f65078e63e20b904 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 24 Feb 2023 13:42:49 -0500
Subject: [PATCH 06/10] Remove KAFKA_HOST_TEST from compute-sanitizer check
 (#12831)

Removes the `KAFKA_HOST_TEST` from the compute-sanitizer memcheck nightly
runs. The following error occurs when running this host test.

```
Running compute-sanitizer on KAFKA_HOST_TEST
========= COMPUTE-SANITIZER
Running main() from gmock_main.cc
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from KafkaDatasourceTest
[ RUN      ] KafkaDatasourceTest.MissingGroupID
[       OK ] KafkaDatasourceTest.MissingGroupID (0 ms)
[ RUN      ] KafkaDatasourceTest.InvalidConfigValues
[       OK ] KafkaDatasourceTest.InvalidConfigValues (0 ms)
[----------] 2 tests from KafkaDatasourceTest (0 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (0 ms total)
[  PASSED  ] 2 tests.
========= Error: Target application terminated before first instrumented API call
========= Tracking kernels launched by child processes requires the --target-processes all option.
```

Adding the `--target-processes all` option gives the same error. This change
disables the check for this test since it is a host test that checks error
conditions and does not appear to make any device calls.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/12831
---
 ci/test_cpp_memcheck.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh
index 0cad4fc3a3f..db9ce143d51 100755
--- a/ci/test_cpp_memcheck.sh
+++ b/ci/test_cpp_memcheck.sh
@@ -11,7 +11,7 @@ set +e
 rapids-logger "Memcheck gtests with rmm_mode=cuda"
 export GTEST_CUDF_RMM_MODE=cuda
 COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"
-for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do
+for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/* ; do
     test_name=$(basename ${gt})
     if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
         continue

From 77c2e03ec572527b5c5c7a3f7a48b0cabd29abde Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 24 Feb 2023 10:47:44 -0800
Subject: [PATCH 07/10] Consolidate linter configs into pyproject.toml (#12834)

This consolidation allows us to get rid of the now-unnecessary setup.cfg
files (thanks to removing versioneer in #12741). It also allows us to move
towards a fully pyproject.toml-driven build.
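As a rough illustration of what the consolidation buys, any TOML-aware tool or
script can now discover a linter's settings under its `[tool.<name>]` table.
This sketch assumes Python 3.11's stdlib `tomllib` and uses the `match-dir`
key added below as an example:

```python
import tomllib  # stdlib in Python >= 3.11; use the `tomli` package on older versions

with open("pyproject.toml", "rb") as f:
    config = tomllib.load(f)

# The pydocstyle settings that previously lived in setup.cfg now sit in a
# [tool.pydocstyle] table alongside the other linters' configuration.
print(config["tool"]["pydocstyle"]["match-dir"])
```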
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - David Wendt (https://github.com/davidwendt) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12834 --- .flake8 | 24 +++++++ .pre-commit-config.yaml | 12 ++-- ci/release/update-version.sh | 2 +- cpp/benchmarks/common/generate_input.cu | 4 +- cpp/benchmarks/common/generate_input.hpp | 6 +- .../developer_guide/contributing_guide.md | 8 +-- pyproject.toml | 38 +++++++++++ python/cudf/cudf/_lib/utils.pyx | 4 +- python/cudf/pyproject.toml | 43 +++++++++++++ python/cudf/setup.cfg | 32 ---------- python/cudf_kafka/pyproject.toml | 46 +++++++++++++ python/cudf_kafka/setup.cfg | 35 ---------- python/custreamz/pyproject.toml | 45 +++++++++++++ python/custreamz/setup.cfg | 34 ---------- python/dask_cudf/pyproject.toml | 45 +++++++++++++ python/dask_cudf/setup.cfg | 31 --------- setup.cfg | 64 ------------------- 17 files changed, 261 insertions(+), 212 deletions(-) create mode 100644 .flake8 delete mode 100644 python/cudf/setup.cfg delete mode 100644 python/cudf_kafka/setup.cfg delete mode 100644 python/custreamz/setup.cfg delete mode 100644 setup.cfg diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000000..e80e3afc443 --- /dev/null +++ b/.flake8 @@ -0,0 +1,24 @@ +# Copyright (c) 2017-2023, NVIDIA CORPORATION. + +[flake8] +filename = *.py, *.pyx, *.pxd, *.pxi +exclude = __init__.py, *.egg, build, docs, .git +force-check = True +ignore = + # line break before binary operator + W503, + # whitespace before : + E203 +per-file-ignores = + # Rules ignored only in Cython: + # E211: whitespace before '(' (used in multi-line imports) + # E225: Missing whitespace around operators (breaks cython casting syntax like ) + # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) + # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) + # E275: Missing whitespace after keyword (Doesn't work with Cython except?) 
+ # E402: invalid syntax (works for Python, not Cython) + # E999: invalid syntax (works for Python, not Cython) + # W504: line break after binary operator (breaks lines that end with a pointer) + *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 + *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 244fc0d3872..e252af717ce 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - args: ["--config=setup.cfg"] + args: ["--config=.flake8"] files: python/.*$ types: [file] types_or: [python, cython] @@ -48,7 +48,7 @@ repos: hooks: - id: mypy additional_dependencies: [types-cachetools] - args: ["--config-file=setup.cfg", + args: ["--config-file=pyproject.toml", "python/cudf/cudf", "python/custreamz/custreamz", "python/cudf_kafka/cudf_kafka", @@ -58,7 +58,9 @@ repos: rev: 6.1.1 hooks: - id: pydocstyle - args: ["--config=setup.cfg"] + # https://github.com/PyCQA/pydocstyle/issues/603 + additional_dependencies: [toml] + args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: @@ -138,9 +140,11 @@ repos: pass_filenames: false verbose: false - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.2.2 hooks: - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] exclude: | (?x)^( .*test.*| diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index c8875fda641..831b91bb2a6 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -94,7 +94,7 @@ sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/setup sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/setup.py # Dependency versions in pyproject.toml -sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml +sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index dee7e2b8586..2829d14070c 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -501,7 +501,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); thrust::exclusive_scan( thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); - // offfsets are ready. + // offsets are ready. auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index f8ea194f0c4..e65aa69763b 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -373,13 +373,13 @@ class data_profile { void set_bool_probability_true(double p) { - CUDF_EXPECTS(p >= 0. and p <= 1., "probablity must be in range [0...1]"); + CUDF_EXPECTS(p >= 0. and p <= 1., "probability must be in range [0...1]"); bool_probability_true = p; } void set_null_probability(std::optional p) { CUDF_EXPECTS(p.value_or(0.) >= 0. and p.value_or(0.) <= 1., - "probablity must be in range [0...1]"); + "probability must be in range [0...1]"); null_probability = p; } void set_cardinality(cudf::size_type c) { cardinality = c; } diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 34071f44914..bb3479cf4c1 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -22,16 +22,16 @@ Specifically, cuDF uses the following tools: In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. - [`pydocstyle`](https://github.com/PyCQA/pydocstyle/) lints docstring style. +- [`codespell`](https://github.com/codespell-project/codespell) finds spelling errors. Linter config data is stored in a number of files. -We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `setup.cfg` > `python/cudf/setup.cfg`). +We generally use `pyproject.toml` over `setup.cfg` and avoid project-specific files (e.g. `pyproject.toml` > `python/cudf/pyproject.toml`). However, differences between tools and the different packages in the repo result in the following caveats: -- `flake8` has no plans to support `pyproject.toml`, so it must live in `setup.cfg`. +- `flake8` has no plans to support `pyproject.toml`, so it must live in `.flake8`. - `isort` must be configured per project to set which project is the "first party" project. -Additionally, our use of `versioneer` means that each project must have a `setup.cfg`. -As a result, we currently maintain both root and project-level `pyproject.toml` and `setup.cfg` files. +As a result, we currently maintain both root and project-level `pyproject.toml` files as well as a `.flake8` file. For more information on how to use pre-commit hooks, see the code formatting section of the [overall contributing guide](https://github.com/rapidsai/cudf/blob/main/CONTRIBUTING.md#python--pre-commit-hooks). diff --git a/pyproject.toml b/pyproject.toml index dfd22f33785..3940d9119ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,3 +17,41 @@ force-exclude = ''' dist )/ ''' + +[tool.pydocstyle] +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. 
+match-dir = "^(?!(ci|cpp|conda|docs|java|notebooks)).*$" +# Allow missing docstrings for docutils +ignore-decorators = ".*(docutils|doc_apply|copy_docstring).*" +select = "D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418" + # Would like to enable the following rules in the future: + # D200, D202, D205, D400 + +[tool.mypy] +ignore_missing_imports = true +# If we don't specify this, then mypy will check excluded files if +# they are imported by a checked file. +follow_imports = "skip" +exclude = [ + "cudf/_lib/", + "cudf/cudf/benchmarks/", + "cudf/cudf/tests/", + "cudf/cudf/utils/metadata/orc_column_statistics_pb2.py", + "custreamz/custreamz/tests/", + "dask_cudf/dask_cudf/tests/", + ] + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +ignore-words-list = "inout,unparseable,falsy" +builtin = "clear" +quiet-level = 3 diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 5f4d3e17fbc..56918799cca 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numpy as np import pyarrow as pa @@ -315,7 +315,7 @@ cdef columns_from_table_view( object owners, ): """ - Given a ``cudf::table_view``, construsts a list of columns from it, + Given a ``cudf::table_view``, constructs a list of columns from it, along with referencing an owner Python object that owns the memory lifetime. owner must be either None or a list of column. If owner is a list of columns, the owner of the `i`th ``cudf::column_view`` diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 49c4d83245f..305e8822030 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -15,3 +15,46 @@ requires = [ "protoc-wheel", "rmm==23.4.*", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", + "dask_cuda", +] +known_rapids = [ + "rmm", +] +known_first_party = [ + "cudf", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg deleted file mode 100644 index 8380da371f9..00000000000 --- a/python/cudf/setup.cfg +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
- -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - rmm -known_first_party= - cudf -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 0924fc90352..308a7869bc0 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -7,3 +7,49 @@ requires = [ "setuptools", "cython>=0.29,<0.30", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", + "dask_cuda", + "streamz", +] +known_rapids = [ + "rmm", + "cudf", + "dask_cudf", +] +known_first_party = [ + "cudf_kafka", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] diff --git a/python/cudf_kafka/setup.cfg b/python/cudf_kafka/setup.cfg deleted file mode 100644 index ee0d783b184..00000000000 --- a/python/cudf_kafka/setup.cfg +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda - streamz -known_rapids= - rmm - cudf - dask_cudf -known_first_party= - cudf_kafka -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 806848c356e..d5c41945482 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -6,3 +6,48 @@ requires = [ "wheel", "setuptools", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", + "dask_cuda", +] +known_rapids = [ + "rmm", + "cudf", + "dask_cudf", +] +known_first_party = [ + "streamz", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", + "__init__.py", +] diff --git a/python/custreamz/setup.cfg b/python/custreamz/setup.cfg deleted file mode 100644 index 8c038db9349..00000000000 --- a/python/custreamz/setup.cfg +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
- -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - rmm - cudf - dask_cudf -known_first_party= - streamz -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - __init__.py diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 806848c356e..8cf823d4291 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -6,3 +6,48 @@ requires = [ "wheel", "setuptools", ] + +[tool.isort] +line_length = 79 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true + +known_dask = [ + "dask", + "distributed", + "dask_cuda", +] +known_rapids = [ + "rmm", + "cudf", +] +known_first_party = [ + "dask_cudf", +] + +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", +] diff --git a/python/dask_cudf/setup.cfg b/python/dask_cudf/setup.cfg index 66f4b8891d0..8139b3c7dc6 100644 --- a/python/dask_cudf/setup.cfg +++ b/python/dask_cudf/setup.cfg @@ -1,36 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -[isort] -line_length=79 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed - dask_cuda -known_rapids= - rmm - cudf -known_first_party= - dask_cudf -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - thirdparty - .eggs - .git - .hg - .mypy_cache - .tox - .venv - _build - buck-out - build - dist - [options.entry_points] dask.dataframe.backends = cudf = dask_cudf.backends:CudfBackendEntrypoint diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 962b7d73bbe..00000000000 --- a/setup.cfg +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2017-2023, NVIDIA CORPORATION. - -[flake8] -filename = *.py, *.pyx, *.pxd, *.pxi -exclude = __init__.py, *.egg, build, docs, .git -force-check = True -ignore = - # line break before binary operator - W503, - # whitespace before : - E203 -per-file-ignores = - # Rules ignored only in Cython: - # E211: whitespace before '(' (used in multi-line imports) - # E225: Missing whitespace around operators (breaks cython casting syntax like ) - # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) - # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) - # E275: Missing whitespace after keyword (Doesn't work with Cython except?) - # E402: invalid syntax (works for Python, not Cython) - # E999: invalid syntax (works for Python, not Cython) - # W504: line break after binary operator (breaks lines that end with a pointer) - *.pyx: E211, E225, E226, E227, E275, E402, E999, W504 - *.pxd: E211, E225, E226, E227, E275, E402, E999, W504 - *.pxi: E211, E225, E226, E227, E275, E402, E999, W504 - -[pydocstyle] -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather -# than include using match-dir. 
Note that as discussed in -# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, -# unlike the match option above this match-dir will have no effect when -# pydocstyle is invoked from pre-commit. Therefore this exclusion list must -# also be maintained in the pre-commit config file. -match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$ -# Allow missing docstrings for docutils -ignore-decorators = .*(docutils|doc_apply|copy_docstring).* -select = - D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418 - # Would like to enable the following rules in the future: - # D200, D202, D205, D400 - -[mypy] -ignore_missing_imports = True -# If we don't specify this, then mypy will check excluded files if -# they are imported by a checked file. -follow_imports = skip -exclude = (?x)( - cudf/_lib/ - | cudf/cudf/benchmarks/ - | cudf/cudf/tests/ - | cudf/cudf/utils/metadata/orc_column_statistics_pb2.py - | custreamz/custreamz/tests/ - | dask_cudf/dask_cudf/tests/ - # This close paren cannot be in column zero otherwise the config parser barfs - ) - -[codespell] -# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - -# this is only to allow you to run codespell interactively -skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp -# ignore short words, and typename parameters like OffsetT -ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b -ignore-words-list = inout,unparseable -builtin = clear -quiet-level = 3 From 4f2f37987fbd66de0cc9116734d2094ca4a39948 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 24 Feb 2023 17:04:59 -0600 Subject: [PATCH 08/10] Enable nbqa pre-commit hooks for isort and black. (#12848) This enables `black` and `isort` linters for ipynb notebooks via [nbqa](https://github.com/nbQA-dev/nbQA). I propose this change to avoid manually linting notebooks like https://github.com/rapidsai/cudf/pull/12595. cc: @galipremsagar Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12848 --- .pre-commit-config.yaml | 10 ++ docs/cudf/source/user_guide/10min.ipynb | 1 + .../cudf/source/user_guide/cupy-interop.ipynb | 34 ++-- .../source/user_guide/guide-to-udfs.ipynb | 149 +++++++++--------- .../cudf/source/user_guide/missing-data.ipynb | 56 ++++--- 5 files changed, 141 insertions(+), 109 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e252af717ce..a030f3bd25b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,6 +61,16 @@ repos: # https://github.com/PyCQA/pydocstyle/issues/603 additional_dependencies: [toml] args: ["--config=pyproject.toml"] + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.6.3 + hooks: + - id: nbqa-isort + # Use the cudf_kafka isort orderings in notebooks so that dask + # and RAPIDS packages have their own sections. + args: ["--settings-file=python/cudf_kafka/pyproject.toml"] + - id: nbqa-black + # Explicitly specify the pyproject.toml at the repo root, not per-project. 
+ args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index af938b79a29..0352c624e04 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -35,6 +35,7 @@ "\n", "import cupy as cp\n", "import pandas as pd\n", + "\n", "import cudf\n", "import dask_cudf\n", "\n", diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 3e169984ace..c98a4ddea23 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -18,9 +18,10 @@ "outputs": [], "source": [ "import timeit\n", - "from packaging import version\n", "\n", "import cupy as cp\n", + "from packaging import version\n", + "\n", "import cudf\n", "\n", "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n", @@ -63,10 +64,13 @@ ], "source": [ "nelem = 10000\n", - "df = cudf.DataFrame({'a':range(nelem),\n", - " 'b':range(500, nelem + 500),\n", - " 'c':range(1000, nelem + 1000)}\n", - " )\n", + "df = cudf.DataFrame(\n", + " {\n", + " \"a\": range(nelem),\n", + " \"b\": range(500, nelem + 500),\n", + " \"c\": range(1000, nelem + 1000),\n", + " }\n", + ")\n", "\n", "%timeit arr_cupy = cupy_from_dlpack(df.to_dlpack())\n", "%timeit arr_cupy = df.values\n", @@ -138,7 +142,7 @@ } ], "source": [ - "col = 'a'\n", + "col = \"a\"\n", "\n", "%timeit cola_cupy = cp.asarray(df[col])\n", "%timeit cola_cupy = cupy_from_dlpack(df[col].to_dlpack())\n", @@ -1088,14 +1092,16 @@ "metadata": {}, "outputs": [], "source": [ - "def cudf_to_cupy_sparse_matrix(data, sparseformat='column'):\n", - " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\n", - " \"\"\"\n", - " if sparseformat not in ('row', 'column',):\n", + "def cudf_to_cupy_sparse_matrix(data, sparseformat=\"column\"):\n", + " \"\"\"Converts a cuDF object to a CuPy Sparse Column matrix.\"\"\"\n", + " if sparseformat not in (\n", + " \"row\",\n", + " \"column\",\n", + " ):\n", " raise ValueError(\"Let's focus on column and row formats for now.\")\n", - " \n", + "\n", " _sparse_constructor = cp.sparse.csc_matrix\n", - " if sparseformat == 'row':\n", + " if sparseformat == \"row\":\n", " _sparse_constructor = cp.sparse.csr_matrix\n", "\n", " return _sparse_constructor(cupy_from_dlpack(data.to_dlpack()))" @@ -1121,8 +1127,8 @@ "nonzero = 1000\n", "for i in range(20):\n", " arr = cp.random.normal(5, 5, nelem)\n", - " arr[cp.random.choice(arr.shape[0], nelem-nonzero, replace=False)] = 0\n", - " df['a' + str(i)] = arr" + " arr[cp.random.choice(arr.shape[0], nelem - nonzero, replace=False)] = 0\n", + " df[\"a\" + str(i)] = arr" ] }, { diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 943fc980a31..ba8c65784d2 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -15,9 +15,10 @@ "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", + "\n", "import cudf\n", - "from cudf.datasets import randomdata\n", - "import numpy as np" + "from cudf.datasets import randomdata" ] }, { @@ -375,7 +376,7 @@ "metadata": {}, "outputs": [], "source": [ - "sr = cudf.Series(['', 'abc', 'some_example'])" + "sr = cudf.Series([\"\", \"abc\", \"some_example\"])" ] }, { @@ -387,9 +388,9 @@ "source": [ "def f(st):\n", " if len(st) > 0:\n", - " if st.startswith('a'):\n", + 
" if st.startswith(\"a\"):\n", " return 1\n", - " elif 'example' in st:\n", + " elif \"example\" in st:\n", " return 2\n", " else:\n", " return -1\n", @@ -443,6 +444,7 @@ "outputs": [], "source": [ "from cudf.core.udf.utils import set_malloc_heap_size\n", + "\n", "set_malloc_heap_size(int(2e9))" ] }, @@ -472,7 +474,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)" + "df = randomdata(nrows=5, dtypes={\"a\": int, \"b\": int, \"c\": int}, seed=12)" ] }, { @@ -484,10 +486,11 @@ "source": [ "from numba import cuda\n", "\n", + "\n", "@cuda.jit\n", "def multiply(in_col, out_col, multiplier):\n", " i = cuda.grid(1)\n", - " if i < in_col.size: # boundary guard\n", + " if i < in_col.size: # boundary guard\n", " out_col[i] = in_col[i] * multiplier" ] }, @@ -508,9 +511,9 @@ "metadata": {}, "outputs": [], "source": [ - "size = len(df['a'])\n", - "df['e'] = 0.0\n", - "multiply.forall(size)(df['a'], df['e'], 10.0)" + "size = len(df[\"a\"])\n", + "df[\"e\"] = 0.0\n", + "multiply.forall(size)(df[\"a\"], df[\"e\"], 10.0)" ] }, { @@ -658,7 +661,7 @@ "outputs": [], "source": [ "def f(row):\n", - " return row['A'] + row['B']" + " return row[\"A\"] + row[\"B\"]" ] }, { @@ -733,10 +736,7 @@ } ], "source": [ - "df = cudf.DataFrame({\n", - " 'A': [1,2,3],\n", - " 'B': [4,cudf.NA,6]\n", - "})\n", + "df = cudf.DataFrame({\"A\": [1, 2, 3], \"B\": [4, cudf.NA, 6]})\n", "df" ] }, @@ -881,13 +881,14 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", + " x = row[\"a\"]\n", " if x is cudf.NA:\n", " return 0\n", " else:\n", " return x + 1\n", "\n", - "df = cudf.DataFrame({'a': [1, cudf.NA, 3]})\n", + "\n", + "df = cudf.DataFrame({\"a\": [1, cudf.NA, 3]})\n", "df" ] }, @@ -988,17 +989,15 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", - " y = row['b']\n", + " x = row[\"a\"]\n", + " y = row[\"b\"]\n", " if x + y > 3:\n", " return cudf.NA\n", " else:\n", " return x + y\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [2, 1, 1]\n", - "})\n", + "\n", + "df = cudf.DataFrame({\"a\": [1, 2, 3], \"b\": [2, 1, 1]})\n", "df" ] }, @@ -1099,12 +1098,10 @@ ], "source": [ "def f(row):\n", - " return row['a'] + row['b']\n", + " return row[\"a\"] + row[\"b\"]\n", + "\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3], \n", - " 'b': [0.5, cudf.NA, 3.14]\n", - "})\n", + "df = cudf.DataFrame({\"a\": [1, 2, 3], \"b\": [0.5, cudf.NA, 3.14]})\n", "df" ] }, @@ -1214,15 +1211,14 @@ ], "source": [ "def f(row):\n", - " x = row['a']\n", + " x = row[\"a\"]\n", " if x > 3:\n", - " return x\n", + " return x\n", " else:\n", - " return 1.5\n", + " return 1.5\n", + "\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 3, 5]\n", - "})\n", + "df = cudf.DataFrame({\"a\": [1, 3, 5]})\n", "df" ] }, @@ -1335,15 +1331,18 @@ ], "source": [ "def f(row):\n", - " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", + " return row[\"a\"] + (row[\"b\"] - (row[\"c\"] / row[\"d\"])) % row[\"e\"]\n", "\n", - "df = cudf.DataFrame({\n", - " 'a': [1, 2, 3],\n", - " 'b': [4, 5, 6],\n", - " 'c': [cudf.NA, 4, 4],\n", - " 'd': [8, 7, 8],\n", - " 'e': [7, 1, 6]\n", - "})\n", + "\n", + "df = cudf.DataFrame(\n", + " {\n", + " \"a\": [1, 2, 3],\n", + " \"b\": [4, 5, 6],\n", + " \"c\": [cudf.NA, 4, 4],\n", + " \"d\": [8, 7, 8],\n", + " \"e\": [7, 1, 6],\n", + " }\n", + ")\n", "df" ] }, @@ -1451,10 +1450,9 @@ } ], "source": [ - "str_df = cudf.DataFrame({\n", - " 'str_col': ['abc', 'ABC', 'Example'],\n", - " 'scale': [1, 2, 3]\n", - "})\n", + "str_df = 
cudf.DataFrame(\n", + " {\"str_col\": [\"abc\", \"ABC\", \"Example\"], \"scale\": [1, 2, 3]}\n", + ")\n", "str_df" ] }, @@ -1466,9 +1464,9 @@ "outputs": [], "source": [ "def f(row):\n", - " st = row['str_col']\n", - " scale = row['scale']\n", - " \n", + " st = row[\"str_col\"]\n", + " scale = row[\"scale\"]\n", + "\n", " if len(st) > 5:\n", " return len(st) + scale\n", " else:\n", @@ -1626,11 +1624,12 @@ } ], "source": [ - "df = df.apply_rows(conditional_add, \n", - " incols={'a':'x', 'e':'y'},\n", - " outcols={'out': np.float64},\n", - " kwargs={}\n", - " )\n", + "df = df.apply_rows(\n", + " conditional_add,\n", + " incols={\"a\": \"x\", \"e\": \"y\"},\n", + " outcols={\"out\": np.float64},\n", + " kwargs={},\n", + ")\n", "df.head()" ] }, @@ -1738,10 +1737,11 @@ " for i, (x, y) in enumerate(zip(a, b)):\n", " out[i] = x + y\n", "\n", - "df = randomdata(nrows=5, dtypes={'a':int, 'b':int, 'c':int}, seed=12)\n", - "df.loc[2, 'a'] = None\n", - "df.loc[3, 'b'] = None\n", - "df.loc[1, 'c'] = None\n", + "\n", + "df = randomdata(nrows=5, dtypes={\"a\": int, \"b\": int, \"c\": int}, seed=12)\n", + "df.loc[2, \"a\"] = None\n", + "df.loc[3, \"b\"] = None\n", + "df.loc[1, \"c\"] = None\n", "df.head()" ] }, @@ -1841,10 +1841,9 @@ } ], "source": [ - "df = df.apply_rows(gpu_add, \n", - " incols=['a', 'b'],\n", - " outcols={'out':np.float64},\n", - " kwargs={})\n", + "df = df.apply_rows(\n", + " gpu_add, incols=[\"a\", \"b\"], outcols={\"out\": np.float64}, kwargs={}\n", + ")\n", "df.head()" ] }, @@ -1892,7 +1891,7 @@ } ], "source": [ - "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype='float64')\n", + "ser = cudf.Series([16, 25, 36, 49, 64, 81], dtype=\"float64\")\n", "ser" ] }, @@ -1935,12 +1934,13 @@ "source": [ "import math\n", "\n", + "\n", "def example_func(window):\n", " b = 0\n", " for a in window:\n", " b = max(b, math.sqrt(a))\n", " if b == 8:\n", - " return 100 \n", + " return 100\n", " return b" ] }, @@ -2064,8 +2064,8 @@ ], "source": [ "df2 = cudf.DataFrame()\n", - "df2['a'] = np.arange(55, 65, dtype='float64')\n", - "df2['b'] = np.arange(55, 65, dtype='float64')\n", + "df2[\"a\"] = np.arange(55, 65, dtype=\"float64\")\n", + "df2[\"b\"] = np.arange(55, 65, dtype=\"float64\")\n", "df2.head()" ] }, @@ -2279,7 +2279,9 @@ } ], "source": [ - "df = randomdata(nrows=10, dtypes={'a':float, 'b':bool, 'c':str, 'e': float}, seed=12)\n", + "df = randomdata(\n", + " nrows=10, dtypes={\"a\": float, \"b\": bool, \"c\": str, \"e\": float}, seed=12\n", + ")\n", "df.head()" ] }, @@ -2290,7 +2292,7 @@ "metadata": {}, "outputs": [], "source": [ - "grouped = df.groupby(['b'])" + "grouped = df.groupby([\"b\"])" ] }, { @@ -2469,9 +2471,9 @@ } ], "source": [ - "results = grouped.apply_grouped(rolling_avg,\n", - " incols=['e'],\n", - " outcols=dict(rolling_avg_e=np.float64))\n", + "results = grouped.apply_grouped(\n", + " rolling_avg, incols=[\"e\"], outcols=dict(rolling_avg_e=np.float64)\n", + ")\n", "results" ] }, @@ -2554,8 +2556,9 @@ " i = cuda.grid(1)\n", " if i < x.size:\n", " out[i] = x[i] * 5\n", - " \n", - "out = cudf.Series(cp.zeros(len(s), dtype='int32'))\n", + "\n", + "\n", + "out = cudf.Series(cp.zeros(len(s), dtype=\"int32\"))\n", "multiply_by_5.forall(s.shape[0])(s, out)\n", "out" ] diff --git a/docs/cudf/source/user_guide/missing-data.ipynb b/docs/cudf/source/user_guide/missing-data.ipynb index ac5bddd34cf..f1404ce0b77 100644 --- a/docs/cudf/source/user_guide/missing-data.ipynb +++ b/docs/cudf/source/user_guide/missing-data.ipynb @@ -39,8 +39,9 @@ "metadata": {}, "outputs": [], "source": [ - "import 
cudf\n", - "import numpy as np" + "import numpy as np\n", + "\n", + "import cudf" ] }, { @@ -50,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = cudf.DataFrame({'a': [1, 2, None, 4], 'b':[0.1, None, 2.3, 17.17]})" + "df = cudf.DataFrame({\"a\": [1, 2, None, 4], \"b\": [0.1, None, 2.3, 17.17]})" ] }, { @@ -221,7 +222,7 @@ } ], "source": [ - "df['a'].notna()" + "df[\"a\"].notna()" ] }, { @@ -304,7 +305,7 @@ } ], "source": [ - "df['b'] == np.nan" + "df[\"b\"] == np.nan" ] }, { @@ -535,7 +536,10 @@ ], "source": [ "import pandas as pd\n", - "datetime_series = cudf.Series([pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")])\n", + "\n", + "datetime_series = cudf.Series(\n", + " [pd.Timestamp(\"20120101\"), pd.NaT, pd.Timestamp(\"20120101\")]\n", + ")\n", "datetime_series" ] }, @@ -618,7 +622,12 @@ "metadata": {}, "outputs": [], "source": [ - "df1 = cudf.DataFrame({'a':[1, None, 2, 3, None], 'b':cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False)})" + "df1 = cudf.DataFrame(\n", + " {\n", + " \"a\": [1, None, 2, 3, None],\n", + " \"b\": cudf.Series([np.nan, 2, 3.2, 0.1, 1], nan_as_null=False),\n", + " }\n", + ")" ] }, { @@ -628,7 +637,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2 = cudf.DataFrame({'a':[1, 11, 2, 34, 10], 'b':cudf.Series([0.23, 22, 3.2, None, 1])})" + "df2 = cudf.DataFrame(\n", + " {\"a\": [1, 11, 2, 34, 10], \"b\": cudf.Series([0.23, 22, 3.2, None, 1])}\n", + ")" ] }, { @@ -899,7 +910,7 @@ } ], "source": [ - "df1['a']" + "df1[\"a\"]" ] }, { @@ -920,7 +931,7 @@ } ], "source": [ - "df1['a'].sum()" + "df1[\"a\"].sum()" ] }, { @@ -949,7 +960,7 @@ } ], "source": [ - "df1['a'].mean()" + "df1[\"a\"].mean()" ] }, { @@ -980,7 +991,7 @@ } ], "source": [ - "df1['a'].sum(skipna=False)" + "df1[\"a\"].sum(skipna=False)" ] }, { @@ -1001,7 +1012,7 @@ } ], "source": [ - "df1['a'].mean(skipna=False)" + "df1[\"a\"].mean(skipna=False)" ] }, { @@ -1035,7 +1046,7 @@ } ], "source": [ - "df1['a'].cumsum()" + "df1[\"a\"].cumsum()" ] }, { @@ -1069,7 +1080,7 @@ } ], "source": [ - "df1['a'].cumsum(skipna=False)" + "df1[\"a\"].cumsum(skipna=False)" ] }, { @@ -1148,7 +1159,7 @@ } ], "source": [ - "cudf.Series([], dtype='float64').sum()" + "cudf.Series([], dtype=\"float64\").sum()" ] }, { @@ -1219,7 +1230,7 @@ } ], "source": [ - "cudf.Series([], dtype='float64').prod()" + "cudf.Series([], dtype=\"float64\").prod()" ] }, { @@ -1382,7 +1393,7 @@ } ], "source": [ - "df1.groupby('a').mean()" + "df1.groupby(\"a\").mean()" ] }, { @@ -1463,7 +1474,7 @@ } ], "source": [ - "df1.groupby('a', dropna=False).mean()" + "df1.groupby(\"a\", dropna=False).mean()" ] }, { @@ -1670,7 +1681,7 @@ } ], "source": [ - "df1['b'].fillna(10)" + "df1[\"b\"].fillna(10)" ] }, { @@ -1697,7 +1708,8 @@ "outputs": [], "source": [ "import cupy as cp\n", - "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list('ABC'))" + "\n", + "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list(\"ABC\"))" ] }, { @@ -2339,7 +2351,7 @@ } ], "source": [ - "df1['a'].dropna()" + "df1[\"a\"].dropna()" ] }, { From d14d980b63402a779a3f75cc64cb3a5a0be7898d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 24 Feb 2023 15:30:07 -0800 Subject: [PATCH 09/10] Add dfg as a pre-commit hook (#12819) This change allows local and remote runs to handle calls to dfg identically, and removes the need for a separate CI check. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12819 --- .github/workflows/pr.yaml | 2 ++ .pre-commit-config.yaml | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 952b58abda5..3a80139e333 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,8 @@ jobs: checks: secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04 + with: + enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a030f3bd25b..1eb2c508db9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -160,6 +160,11 @@ repos: .*test.*| ^CHANGELOG.md$ ) + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.4.0 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 From eb4da9345f172c3911f78c5e851757ec2ec222b9 Mon Sep 17 00:00:00 2001 From: Carl Simon Adorf Date: Sat, 25 Feb 2023 01:13:34 +0100 Subject: [PATCH 10/10] CI: Remove specification of manual stage for check_style.sh script. (#12803) Do not explicitly specify to run the "manual" stage when running pre-commits as part of the ci/check_style.sh script. Authors: - Carl Simon Adorf (https://github.com/csadorf) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12803 --- ci/check_style.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/check_style.sh b/ci/check_style.sh index 020143095ce..f9bfea7b47c 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. set -euo pipefail @@ -20,4 +20,4 @@ mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} # Run pre-commit checks -pre-commit run --hook-stage manual --all-files --show-diff-on-failure +pre-commit run --all-files --show-diff-on-failure
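
A note on what dropping `--hook-stage manual` changes: pre-commit only runs hooks whose configured stages match the invocation, so after this change ci/check_style.sh exercises exactly the same default-stage hooks as a plain local `pre-commit run`. A minimal sketch of the mechanism, using a deliberately hypothetical hook id (`some-expensive-check` is illustrative only, not a hook in this repo):

    # .pre-commit-config.yaml (hypothetical entry, for illustration only)
    - id: some-expensive-check
      stages: [manual]   # opted out of the default stage

With such an entry in place, the default invocation skips the hook:

    # Matches what ci/check_style.sh now runs; manual-stage hooks are skipped
    pre-commit run --all-files --show-diff-on-failure

    # The hook only executes when the manual stage is requested explicitly
    pre-commit run --hook-stage manual --all-files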