From 90882d36641712a397cce58103b82b644d1ef5cc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 28 Mar 2022 13:09:18 -0700 Subject: [PATCH] Update pre-commit to run black 22.3.0 (#10523) This PR updates us to use black 22.3.0, which is now necessary because older versions of black are not compatible with current versions of Click (see https://github.com/psf/black/issues/2964). I've opened this for 22.06 since [I don't see any open PRs attempting to merge into 22.04](https://github.com/rapidsai/cudf/pulls?q=is%3Apr+is%3Aopen+base%3Abranch-22.04) anymore, but this issue will block CI (which runs style checks using pre-commit) so if necessary I can backport. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) --- .pre-commit-config.yaml | 2 +- python/cudf/cudf/_fuzz_testing/avro.py | 4 +- python/cudf/cudf/_fuzz_testing/csv.py | 6 +- python/cudf/cudf/_fuzz_testing/fuzzer.py | 8 +- python/cudf/cudf/_fuzz_testing/json.py | 6 +- python/cudf/cudf/_fuzz_testing/orc.py | 6 +- python/cudf/cudf/_fuzz_testing/parquet.py | 6 +- .../_fuzz_testing/tests/fuzz_test_parquet.py | 10 +- python/cudf/cudf/_lib/column.pyi | 100 +++--------- python/cudf/cudf/comm/serialize.py | 3 +- python/cudf/cudf/core/_base_index.py | 10 +- python/cudf/cudf/core/_internals/where.py | 27 +-- python/cudf/cudf/core/column/categorical.py | 27 ++- python/cudf/cudf/core/column/column.py | 23 ++- python/cudf/cudf/core/column/lists.py | 8 +- python/cudf/cudf/core/column/methods.py | 7 +- python/cudf/cudf/core/column/numerical.py | 12 +- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 30 +++- python/cudf/cudf/core/column/timedelta.py | 5 +- python/cudf/cudf/core/column_accessor.py | 8 +- python/cudf/cudf/core/dataframe.py | 49 +++++- python/cudf/cudf/core/df_protocol.py | 8 +- python/cudf/cudf/core/dtypes.py | 5 +- 
python/cudf/cudf/core/frame.py | 40 ++++- python/cudf/cudf/core/index.py | 26 ++- python/cudf/cudf/core/indexed_frame.py | 9 +- python/cudf/cudf/core/mixins/binops.py | 2 +- python/cudf/cudf/core/mixins/binops.pyi | 111 ++++--------- python/cudf/cudf/core/mixins/reductions.pyi | 87 +++------- python/cudf/cudf/core/mixins/scans.py | 7 +- python/cudf/cudf/core/mixins/scans.pyi | 15 +- python/cudf/cudf/core/multiindex.py | 17 +- python/cudf/cudf/core/resample.py | 6 +- python/cudf/cudf/core/reshape.py | 3 +- python/cudf/cudf/core/series.py | 29 +++- python/cudf/cudf/core/single_column_frame.py | 7 +- python/cudf/cudf/core/tools/datetimes.py | 12 +- python/cudf/cudf/core/tools/numeric.py | 4 +- python/cudf/cudf/core/udf/lowering.py | 5 +- python/cudf/cudf/core/udf/typing.py | 14 +- python/cudf/cudf/core/window/rolling.py | 5 +- python/cudf/cudf/io/avro.py | 5 +- python/cudf/cudf/io/csv.py | 3 +- python/cudf/cudf/io/json.py | 5 +- python/cudf/cudf/io/orc.py | 8 +- python/cudf/cudf/io/parquet.py | 29 +++- python/cudf/cudf/io/text.py | 5 +- python/cudf/cudf/testing/_utils.py | 3 +- python/cudf/cudf/testing/dataset_generator.py | 51 ++++-- python/cudf/cudf/tests/test_array_ufunc.py | 7 +- python/cudf/cudf/tests/test_binops.py | 6 +- python/cudf/cudf/tests/test_categorical.py | 4 +- python/cudf/cudf/tests/test_column.py | 15 +- python/cudf/cudf/tests/test_concat.py | 39 ++++- python/cudf/cudf/tests/test_csv.py | 4 +- python/cudf/cudf/tests/test_cut.py | 28 +++- python/cudf/cudf/tests/test_dataframe.py | 154 +++++++++++++----- python/cudf/cudf/tests/test_datetime.py | 17 +- python/cudf/cudf/tests/test_doctests.py | 6 +- python/cudf/cudf/tests/test_dropna.py | 5 +- python/cudf/cudf/tests/test_duplicates.py | 5 +- python/cudf/cudf/tests/test_groupby.py | 31 +++- python/cudf/cudf/tests/test_hash_vocab.py | 2 +- python/cudf/cudf/tests/test_index.py | 50 ++++-- python/cudf/cudf/tests/test_indexing.py | 9 +- python/cudf/cudf/tests/test_interval.py | 8 +- 
python/cudf/cudf/tests/test_joining.py | 9 +- python/cudf/cudf/tests/test_list.py | 55 +++++-- python/cudf/cudf/tests/test_monotonic.py | 5 +- python/cudf/cudf/tests/test_numerical.py | 47 +++--- python/cudf/cudf/tests/test_onehot.py | 2 +- python/cudf/cudf/tests/test_orc.py | 15 +- python/cudf/cudf/tests/test_parquet.py | 15 +- python/cudf/cudf/tests/test_query.py | 4 +- python/cudf/cudf/tests/test_rank.py | 5 +- python/cudf/cudf/tests/test_reductions.py | 7 +- python/cudf/cudf/tests/test_replace.py | 27 ++- python/cudf/cudf/tests/test_resampling.py | 8 +- python/cudf/cudf/tests/test_reshape.py | 4 +- python/cudf/cudf/tests/test_rolling.py | 2 +- python/cudf/cudf/tests/test_s3.py | 9 +- python/cudf/cudf/tests/test_series.py | 56 +++++-- python/cudf/cudf/tests/test_seriesmap.py | 6 +- python/cudf/cudf/tests/test_stats.py | 2 +- python/cudf/cudf/tests/test_string.py | 72 +++++--- python/cudf/cudf/tests/test_testing.py | 14 +- python/cudf/cudf/tests/test_timedelta.py | 12 +- python/cudf/cudf/tests/test_transform.py | 6 +- python/cudf/cudf/tests/test_udf_binops.py | 6 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 8 +- python/cudf/cudf/utils/hash_vocab_utils.py | 14 +- python/cudf/cudf/utils/ioutils.py | 29 +++- python/cudf/setup.py | 5 +- python/cudf_kafka/setup.py | 5 +- python/cudf_kafka/versioneer.py | 3 +- python/dask_cudf/dask_cudf/backends.py | 1 - python/dask_cudf/dask_cudf/core.py | 4 +- python/dask_cudf/dask_cudf/groupby.py | 14 +- python/dask_cudf/dask_cudf/io/parquet.py | 5 +- .../dask_cudf/io/tests/test_parquet.py | 12 +- .../dask_cudf/tests/test_accessor.py | 27 ++- python/dask_cudf/dask_cudf/tests/test_core.py | 6 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 25 ++- .../dask_cudf/dask_cudf/tests/test_onehot.py | 5 +- 105 files changed, 1173 insertions(+), 648 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e72c0119f3..21f15ade458 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 
@@ repos: args: ["--settings-path=python/dask_cudf/setup.cfg"] files: python/dask_cudf/.* - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black files: python/.* diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index 4c167ac627f..d9974037daa 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import copy import io @@ -73,7 +73,7 @@ def generate_input(self): self, dtypes_list ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 5f628904276..8ab7048cff0 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import logging import random @@ -50,7 +50,7 @@ def generate_input(self): seed, ) = self.get_next_regression_params() else: - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( @@ -155,7 +155,7 @@ def generate_input(self): seed, ) = self.get_next_regression_params() else: - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index a51a5073510..b99cd938a63 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import datetime import json @@ -60,10 +60,12 @@ def write_crash(self, error): error_file_name = datetime.datetime.now().__str__() if self._crash_dir: crash_path = os.path.join( - self._crash_dir, error_file_name + "_crash.json", + self._crash_dir, + error_file_name + "_crash.json", ) crash_log_path = os.path.join( - self._crash_dir, error_file_name + "_crash.log", + self._crash_dir, + error_file_name + "_crash.log", ) else: crash_path = error_file_name + "_crash.json" diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index 8a8a3d5bff7..f850a7e79f9 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import logging import random @@ -65,7 +65,7 @@ def generate_input(self): seed, ) = self.get_next_regression_params() else: - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list( cudf.utils.dtypes.ALL_TYPES @@ -140,7 +140,7 @@ def generate_input(self): seed, ) = self.get_next_regression_params() else: - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list( cudf.utils.dtypes.ALL_TYPES diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 78e01fb76a4..65d2e09988f 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import copy import io @@ -69,7 +69,7 @@ def generate_input(self): ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -188,7 +188,7 @@ def generate_input(self): self, dtypes_list ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 859d09b407f..31be9aa2a5e 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import logging import random @@ -64,7 +64,7 @@ def generate_input(self): self, dtypes_list ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -139,7 +139,7 @@ def generate_input(self): seed, ) = self.get_next_regression_params() else: - seed = random.randint(0, 2 ** 32 - 1) + seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list( cudf.utils.dtypes.ALL_TYPES diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index db2bcf74112..5b5e7c5964d 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import sys @@ -91,10 +91,14 @@ def parquet_writer_test_rowgroup_index_compression( gdf = cudf.from_pandas(pdf) pdf.to_parquet( - pd_file_name, compression=compression, row_group_size=row_group_size, + pd_file_name, + compression=compression, + row_group_size=row_group_size, ) gdf.to_parquet( - gd_file_name, compression=compression, row_group_size=row_group_size, + gd_file_name, + compression=compression, + row_group_size=row_group_size, ) actual = cudf.read_parquet(gd_file_name) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index 235cb4fd973..0d61e4f02e5 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -32,94 +32,46 @@ class Column: offset: int = None, null_count: int = None, children: Tuple[ColumnBase, ...] = (), - ) -> None: - ... - + ) -> None: ... @property - def base_size(self) -> int: - ... - + def base_size(self) -> int: ... @property - def dtype(self) -> DtypeObj: - ... - + def dtype(self) -> DtypeObj: ... 
@property - def size(self) -> int: - ... - + def size(self) -> int: ... @property - def base_data(self) -> Optional[Buffer]: - ... - + def base_data(self) -> Optional[Buffer]: ... @property - def base_data_ptr(self) -> int: - ... - + def base_data_ptr(self) -> int: ... @property - def data(self) -> Optional[Buffer]: - ... - + def data(self) -> Optional[Buffer]: ... @property - def data_ptr(self) -> int: - ... - - def set_base_data(self, value: Buffer) -> None: - ... - + def data_ptr(self) -> int: ... + def set_base_data(self, value: Buffer) -> None: ... @property - def nullable(self) -> bool: - ... - - def has_nulls(self, include_nan: bool=False) -> bool: - ... - + def nullable(self) -> bool: ... + def has_nulls(self, include_nan: bool = False) -> bool: ... @property - def base_mask(self) -> Optional[Buffer]: - ... - + def base_mask(self) -> Optional[Buffer]: ... @property - def base_mask_ptr(self) -> int: - ... - + def base_mask_ptr(self) -> int: ... @property - def mask(self) -> Optional[Buffer]: - ... - + def mask(self) -> Optional[Buffer]: ... @property - def mask_ptr(self) -> int: - ... - - def set_base_mask(self, value: Optional[Buffer]) -> None: - ... - - def set_mask(self: T, value: Optional[Buffer]) -> T: - ... - + def mask_ptr(self) -> int: ... + def set_base_mask(self, value: Optional[Buffer]) -> None: ... + def set_mask(self: T, value: Optional[Buffer]) -> T: ... @property - def null_count(self) -> int: - ... - + def null_count(self) -> int: ... @property - def offset(self) -> int: - ... - + def offset(self) -> int: ... @property - def base_children(self) -> Tuple[ColumnBase, ...]: - ... - + def base_children(self) -> Tuple[ColumnBase, ...]: ... @property - def children(self) -> Tuple[ColumnBase, ...]: - ... - - def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: - ... - - def _mimic_inplace(self, other_col: ColumnBase, inplace=False) -> Optional[ColumnBase]: - ... - + def children(self) -> Tuple[ColumnBase, ...]: ... 
+ def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... + def _mimic_inplace( + self, other_col: ColumnBase, inplace=False + ) -> Optional[ColumnBase]: ... @staticmethod - def from_scalar( - val: ScalarLike, - size: int - ) -> ColumnBase: # TODO: This should be Scalar, not ScalarLike - ... + def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py index 431b6bb2984..9fb28907e73 100644 --- a/python/cudf/cudf/comm/serialize.py +++ b/python/cudf/cudf/comm/serialize.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import cudf # noqa: F401 from cudf.core.abc import Serializable @@ -26,7 +28,6 @@ def dask_deserialize_cudf_object(header, frames): with log_errors(): return Serializable.host_deserialize(header, frames) - except ImportError: # distributed is probably not installed on the system pass diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e05c55bd78f..259a7f711c3 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1389,7 +1389,9 @@ def _constructor_expanddim(self): return cudf.MultiIndex def drop_duplicates( - self, keep="first", nulls_are_equal=True, + self, + keep="first", + nulls_are_equal=True, ): """ Drop duplicate rows in index. 
@@ -1435,7 +1437,11 @@ def dropna(self, how="any"): ] return self._from_columns_like_self( - drop_nulls(data_columns, how=how, keys=range(len(data_columns)),), + drop_nulls( + data_columns, + how=how, + keys=range(len(data_columns)), + ), self._column_names, ) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 8bfcad4c8f4..59e7d629092 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -93,10 +93,10 @@ def _check_and_cast_columns_with_other( def _normalize_columns_and_scalars_type( - frame: Frame, other: Any, inplace: bool = False, -) -> Tuple[ - Union[Frame, ColumnLike], Any, -]: + frame: Frame, + other: Any, + inplace: bool = False, +) -> Tuple[Union[Frame, ColumnLike], Any]: """ Try to normalize the other's dtypes as per frame. @@ -176,7 +176,10 @@ def _normalize_columns_and_scalars_type( def where( - frame: Frame, cond: Any, other: Any = None, inplace: bool = False, + frame: Frame, + cond: Any, + other: Any = None, + inplace: bool = False, ) -> Optional[Union[Frame]]: """ Replace values where the condition is False. @@ -266,9 +269,10 @@ def where( # as `cond` has no column names. 
cond._set_column_names_like(frame) - (source_df, others,) = _normalize_columns_and_scalars_type( - frame, other - ) + ( + source_df, + others, + ) = _normalize_columns_and_scalars_type(frame, other) if isinstance(others, Frame): others = others._data.columns @@ -340,9 +344,10 @@ def where( """Array conditional must be same shape as self""" ) - (input_col, other,) = _normalize_columns_and_scalars_type( - frame, other, inplace - ) + ( + input_col, + other, + ) = _normalize_columns_and_scalars_type(frame, other, inplace) if isinstance(input_col, cudf.core.column.CategoricalColumn): if cudf.api.types.is_scalar(other): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index e0022ed21ca..9f00f9a203f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -350,7 +350,9 @@ def add_categories( return self._return_or_inplace(out_col, inplace=inplace) def remove_categories( - self, removals: Any, inplace: bool = False, + self, + removals: Any, + inplace: bool = False, ) -> Optional[SeriesOrIndex]: """ Remove the specified categories. 
@@ -768,7 +770,9 @@ def children(self) -> Tuple[NumericalColumn]: codes_column = cast( cudf.core.column.NumericalColumn, column.build_column( - data=buf, dtype=codes_column.dtype, size=self.size, + data=buf, + dtype=codes_column.dtype, + size=self.size, ), ) self._children = (codes_column,) @@ -988,7 +992,9 @@ def to_arrow(self) -> pa.Array: out_dictionary = categories.to_arrow() return pa.DictionaryArray.from_arrays( - out_indices, out_dictionary, ordered=self.ordered, + out_indices, + out_dictionary, + ordered=self.ordered, ) @property @@ -1216,7 +1222,8 @@ def fillna( # TODO: only required if fill_value has a subset of the # categories: fill_value = fill_value._set_categories( - self.categories, is_unique=True, + self.categories, + is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( self.codes.dtype @@ -1415,7 +1422,10 @@ def _with_type_metadata( return self def set_categories( - self, new_categories: Any, ordered: bool = False, rename: bool = False, + self, + new_categories: Any, + ordered: bool = False, + rename: bool = False, ) -> CategoricalColumn: # See CategoricalAccessor.set_categories. @@ -1460,7 +1470,8 @@ def set_categories( or not self.ordered == ordered ): out_col = out_col._set_categories( - new_categories, ordered=ordered, + new_categories, + ordered=ordered, ) return out_col @@ -1555,7 +1566,9 @@ def _set_categories( ) def reorder_categories( - self, new_categories: Any, ordered: bool = False, + self, + new_categories: Any, + ordered: bool = False, ) -> CategoricalColumn: new_categories = column.as_column(new_categories) # Compare new_categories against current categories. 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 401d5f82743..bc59b67119e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -553,7 +553,10 @@ def _scatter_by_slice( # step != 1, create a scatter map with arange scatter_map = arange( - start=start, stop=stop, step=step, dtype=cudf.dtype(np.int32), + start=start, + stop=stop, + step=step, + dtype=cudf.dtype(np.int32), ) return self._scatter_by_column(scatter_map, value) @@ -620,7 +623,10 @@ def _check_scatter_key_length( raise ValueError(msg) def fillna( - self: T, value: Any = None, method: str = None, dtype: Dtype = None, + self: T, + value: Any = None, + method: str = None, + dtype: Dtype = None, ) -> T: """Fill null values with ``value``. @@ -844,7 +850,9 @@ def get_slice_bound(self, label: ScalarLike, side: str, kind: str) -> int: raise ValueError(f"Invalid value for side: {side}") def sort_by_values( - self: ColumnBase, ascending: bool = True, na_position: str = "last", + self: ColumnBase, + ascending: bool = True, + na_position: str = "last", ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: col_inds = self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position @@ -1884,7 +1892,8 @@ def as_column( # changing from pd array to series,possible arrow bug interval_series = pd.Series(arbitrary) data = as_column( - pa.Array.from_pandas(interval_series), dtype=arbitrary.dtype, + pa.Array.from_pandas(interval_series), + dtype=arbitrary.dtype, ) if dtype is not None: data = data.astype(dtype) @@ -2109,7 +2118,11 @@ def _construct_array( if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and infer_dtype(arbitrary) in ("mixed", "mixed-integer",) + and infer_dtype(arbitrary) + in ( + "mixed", + "mixed-integer", + ) ): native_dtype = "object" arbitrary = np.asarray( diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 
0df5be2d862..60d13150b39 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -32,7 +32,13 @@ class ListColumn(ColumnBase): _VALID_BINARY_OPERATIONS = {"__add__", "__radd__"} def __init__( - self, size, dtype, mask=None, offset=0, null_count=None, children=(), + self, + size, + dtype, + mask=None, + offset=0, + null_count=None, + children=(), ): super().__init__( None, diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index a63fa927cfc..56dcd41666b 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -40,7 +40,10 @@ def _return_or_inplace( @overload def _return_or_inplace( - self, new_col, expand: bool = False, retain_index: bool = True, + self, + new_col, + expand: bool = False, + retain_index: bool = True, ) -> ParentType: ... diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c9bc3c59aea..a89c8dfed54 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -173,7 +173,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if op in {"__truediv__", "__rtruediv__"}: # Division with integer types results in a suitable float. 
- if (truediv_type := int_float_dtype_mapping.get(self.dtype.type)) : + if truediv_type := int_float_dtype_mapping.get(self.dtype.type): return self.astype(truediv_type)._binaryop(other, op) reflect, op = self._check_reflected_op(op) @@ -258,7 +258,9 @@ def normalize_binop_value( other, size=len(self), dtype=other_dtype ) return column.build_column( - data=Buffer(ary), dtype=ary.dtype, mask=self.mask, + data=Buffer(ary), + dtype=ary.dtype, + mask=self.mask, ) else: return NotImplemented @@ -521,7 +523,11 @@ def _find_value( raise ValueError("Expected a numeric value") found = 0 if len(self): - found = find(self.data_array_view, value, mask=self.mask,) + found = find( + self.data_array_view, + value, + mask=self.mask, + ) if found == -1: if self.is_monotonic_increasing and closest: found = find( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index b547cb43cf5..3ae60671b5a 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -62,7 +62,7 @@ def kurtosis(self, skipna: bool = None) -> float: return 0 term_one_section_one = (n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3)) - term_one_section_two = m4_numerator / (V ** 2) + term_one_section_two = m4_numerator / (V**2) term_two = ((n - 1) ** 2) / ((n - 2) * (n - 3)) kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d18bcaa84f4..c1ef33be975 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -254,7 +254,9 @@ def byte_count(self) -> SeriesOrIndex: 2 11 dtype: int32 """ - return self._return_or_inplace(libstrings.count_bytes(self._column),) + return self._return_or_inplace( + libstrings.count_bytes(self._column), + ) @overload def cat(self, sep: str = None, na_rep: str = None) -> str: @@ -355,7 +357,9 @@ def cat(self, others=None, 
sep=None, na_rep=None): if others is None: data = libstrings.join( - self._column, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), + self._column, + cudf.Scalar(sep), + cudf.Scalar(na_rep, "str"), ) else: other_cols = _get_cols_list(self._parent, others) @@ -783,7 +787,10 @@ def contains( ) return self._return_or_inplace(result_col) - def repeat(self, repeats: Union[int, Sequence],) -> SeriesOrIndex: + def repeat( + self, + repeats: Union[int, Sequence], + ) -> SeriesOrIndex: """ Duplicate each string in the Series or Index. Equivalent to `str.repeat() @@ -828,7 +835,8 @@ def repeat(self, repeats: Union[int, Sequence],) -> SeriesOrIndex: if can_convert_to_column(repeats): return self._return_or_inplace( libstrings.repeat_sequence( - self._column, column.as_column(repeats, dtype="int"), + self._column, + column.as_column(repeats, dtype="int"), ), ) @@ -921,7 +929,9 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( - self._column, pat, column.as_column(repl, dtype="str"), + self._column, + pat, + column.as_column(repl, dtype="str"), ) if regex else libstrings.replace_multi( @@ -5173,7 +5183,10 @@ def to_arrow(self) -> pa.Array: return super().to_arrow() def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0, + self, + skipna: bool = None, + dtype: Dtype = None, + min_count: int = 0, ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count @@ -5417,7 +5430,10 @@ def find_and_replace( return libcudf.replace.replace(res, df._data["old"], df._data["new"]) def fillna( - self, fill_value: Any = None, method: str = None, dtype: Dtype = None, + self, + fill_value: Any = None, + method: str = None, + dtype: Dtype = None, ) -> StringColumn: if fill_value is not None: if not is_scalar(fill_value): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 11d295a6190..8e1b938033e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ 
b/python/cudf/cudf/core/column/timedelta.py @@ -378,7 +378,10 @@ def quantile( return result.astype(self.dtype) def sum( - self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None, + self, + skipna: bool = None, + min_count: int = 0, + dtype: Dtype = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overriden in Numerical[Base]Column, mypy only diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c9c00692174..291e50386cc 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -378,7 +378,9 @@ def select_by_index(self, index: Any) -> ColumnAccessor: keys = self.get_labels_by_index(index) data = {k: self._data[k] for k in keys} return self.__class__( - data, multiindex=self.multiindex, level_names=self.level_names, + data, + multiindex=self.multiindex, + level_names=self.level_names, ) def set_by_label(self, key: Any, value: Any, validate: bool = True): @@ -412,7 +414,9 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: if self.multiindex: data = _to_flat_dict(data) return self.__class__( - data, multiindex=self.multiindex, level_names=self.level_names, + data, + multiindex=self.multiindex, + level_names=self.level_names, ) def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 233a0b0beda..17cac3593a3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -826,7 +826,9 @@ def _init_from_dict_like( masked = index is not None data = { key: cudf.core.column.column_empty( - row_count=row_count, dtype=None, masked=masked, + row_count=row_count, + dtype=None, + masked=masked, ) for key in extra_cols } @@ -855,7 +857,10 @@ def _init_from_dict_like( col_name, tuple ) self._insert( - i, col_name, data[col_name], nan_as_null=nan_as_null, + i, + col_name, + data[col_name], + nan_as_null=nan_as_null, ) if 
columns is not None: @@ -2095,7 +2100,9 @@ def _set_column_names(self, names, multiindex=False, level_names=None): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, multiindex=multiindex, level_names=level_names, + data, + multiindex=multiindex, + level_names=level_names, ) def _set_column_names_like(self, other): @@ -3370,7 +3377,13 @@ def merge( @_cudf_nvtx_annotate def join( - self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, + self, + other, + on=None, + how="left", + lsuffix="", + rsuffix="", + sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -4507,7 +4520,9 @@ def to_arrow(self, preserve_index=True): gen_names, self.index._data.names ): data._insert( - data.shape[1], gen_name, self.index._data[col_name], + data.shape[1], + gen_name, + self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5095,7 +5110,12 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): @_cudf_nvtx_annotate def _reduce( - self, op, axis=None, level=None, numeric_only=None, **kwargs, + self, + op, + axis=None, + level=None, + numeric_only=None, + **kwargs, ): if level is not None: raise NotImplementedError("level parameter is not implemented yet") @@ -5123,7 +5143,11 @@ def _reduce( @_cudf_nvtx_annotate def _scan( - self, op, axis=None, *args, **kwargs, + self, + op, + axis=None, + *args, + **kwargs, ): axis = self._get_axis_from_axis_arg(axis) @@ -5355,7 +5379,11 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series(result, index=self.index, dtype=result_dtype,) + return Series( + result, + index=self.index, + dtype=result_dtype, + ) else: result_df = DataFrame(result).set_index(self.index) result_df._set_column_names_like(prepared) @@ -6532,7 +6560,10 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in 
categories: cols[idx] = ( cols[idx] - ._set_categories(categories[idx], is_unique=True,) + ._set_categories( + categories[idx], + is_unique=True, + ) .codes ) cols[idx] = cols[idx].astype(dtype) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 8f00289afcb..4a30a78bf65 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -438,7 +438,9 @@ def _get_validity_buffer( f"See {self.__class__.__name__}.describe_null method." ) - def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + def _get_offsets_buffer( + self, + ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -464,7 +466,9 @@ def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: return buffer, dtype - def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + def _get_data_buffer( + self, + ) -> Tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the data and the buffer's associated dtype. 
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 3a1c366b429..21cae5f05b7 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -349,7 +349,10 @@ def deserialize(cls, header: dict, frames: list): dtype_header, (start, stop) = dtype fields[k] = pickle.loads( dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop],) + ).deserialize( + dtype_header, + frames[start:stop], + ) else: fields[k] = dtype return cls(fields) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d78744a719f..a84606b0953 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -145,7 +145,9 @@ def _from_data(cls, data: MutableMapping): @classmethod @_cudf_nvtx_annotate def _from_columns( - cls, columns: List[ColumnBase], column_names: abc.Iterable[str], + cls, + columns: List[ColumnBase], + column_names: abc.Iterable[str], ): """Construct a `Frame` object from a list of columns.""" data = {name: columns[i] for i, name in enumerate(column_names)} @@ -688,7 +690,8 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): """ if isinstance(self, cudf.BaseIndex): warnings.warn( - "Index.clip is deprecated and will be removed.", FutureWarning, + "Index.clip is deprecated and will be removed.", + FutureWarning, ) if axis != 1: @@ -1131,7 +1134,8 @@ def fillna( filled_data[col_name] = col.copy(deep=True) return self._mimic_inplace( - self._from_data(data=filled_data), inplace=inplace, + self._from_data(data=filled_data), + inplace=inplace, ) @_cudf_nvtx_annotate @@ -2656,7 +2660,12 @@ def _reduce(self, *args, **kwargs): @_cudf_nvtx_annotate def min( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + **kwargs, ): """ Return the minimum of the values in the DataFrame. 
@@ -2702,7 +2711,12 @@ def min( @_cudf_nvtx_annotate def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + **kwargs, ): """ Return the maximum of the values in the DataFrame. @@ -3188,7 +3202,11 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): dtype: bool """ return self._reduce( - "all", axis=axis, skipna=skipna, level=level, **kwargs, + "all", + axis=axis, + skipna=skipna, + level=level, + **kwargs, ) @_cudf_nvtx_annotate @@ -3224,7 +3242,11 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): dtype: bool """ return self._reduce( - "any", axis=axis, skipna=skipna, level=level, **kwargs, + "any", + axis=axis, + skipna=skipna, + level=level, + **kwargs, ) @_cudf_nvtx_annotate @@ -5328,7 +5350,9 @@ def _get_replacement_values_for_columns( col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) else cudf.utils.utils.scalar_broadcast_to( - value, (len(to_replace),), cudf.dtype(type(value)), + value, + (len(to_replace),), + cudf.dtype(type(value)), ) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d935da3bd14..7df5be3f692 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -852,7 +852,12 @@ def _from_data( return out def _binaryop( - self, other: T, op: str, fill_value: Any = None, *args, **kwargs, + self, + other: T, + op: str, + fill_value: Any = None, + *args, + **kwargs, ) -> SingleColumnFrame: reflect, op = self._check_reflected_op(op) operands = self._make_operands_for_binop(other, fill_value, reflect) @@ -2369,7 +2374,12 @@ def is_categorical(self): @_cudf_nvtx_annotate def interval_range( - start=None, end=None, periods=None, freq=None, name=None, closed="right", + start=None, + end=None, + periods=None, + freq=None, + name=None, + closed="right", ) -> "IntervalIndex": """ Returns a fixed frequency IntervalIndex. 
@@ -2532,7 +2542,12 @@ class IntervalIndex(GenericIndex): @_cudf_nvtx_annotate def __init__( - self, data, closed=None, dtype=None, copy=False, name=None, + self, + data, + closed=None, + dtype=None, + copy=False, + name=None, ): if copy: data = column.as_column(data, dtype=dtype).copy() @@ -2542,7 +2557,10 @@ def __init__( elif isinstance(data, pd.Series) and (is_interval_dtype(data.dtype)): data = column.as_column(data, data.dtype) elif isinstance(data, (pd._libs.interval.Interval, pd.IntervalIndex)): - data = column.as_column(data, dtype=dtype,) + data = column.as_column( + data, + dtype=dtype, + ) elif not data: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b8077d7d28b..c5c2322d95a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -194,7 +194,9 @@ def _num_rows(self) -> int: @classmethod def _from_data( - cls, data: MutableMapping, index: Optional[BaseIndex] = None, + cls, + data: MutableMapping, + index: Optional[BaseIndex] = None, ): out = super()._from_data(data) out._index = RangeIndex(out._data.nrows) if index is None else index @@ -1758,7 +1760,10 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): index_names, ) = self._index._split_columns_by_levels(level) if index_columns: - index = _index_from_columns(index_columns, name=self._index.name,) + index = _index_from_columns( + index_columns, + name=self._index.name, + ) if isinstance(index, MultiIndex): index.names = index_names else: diff --git a/python/cudf/cudf/core/mixins/binops.py b/python/cudf/cudf/core/mixins/binops.py index e07977ed4c3..eaabc00f266 100644 --- a/python/cudf/cudf/core/mixins/binops.py +++ b/python/cudf/cudf/core/mixins/binops.py @@ -63,7 +63,7 @@ def _binaryop(self, other, op: str): def _check_reflected_op(op): - if (reflect := op[2] == "r" and op != "__rshift__") : + if reflect := op[2] 
== "r" and op != "__rshift__": op = op[:2] + op[3:] return reflect, op diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi index ff47cdce418..8587b2dea48 100644 --- a/python/cudf/cudf/core/mixins/binops.pyi +++ b/python/cudf/cudf/core/mixins/binops.pyi @@ -8,87 +8,32 @@ BinaryOperandType = TypeVar("BinaryOperandType", bound="Any") class BinaryOperand: _SUPPORTED_BINARY_OPERATIONS: Set - def _binaryop(self, other: BinaryOperandType, op: str): - ... - - def __add__(self, other): - ... - - def __sub__(self, other): - ... - - def __mul__(self, other): - ... - - def __truediv__(self, other): - ... - - def __floordiv__(self, other): - ... - - def __mod__(self, other): - ... - - def __pow__(self, other): - ... - - def __and__(self, other): - ... - - def __xor__(self, other): - ... - - def __or__(self, other): - ... - - def __radd__(self, other): - ... - - def __rsub__(self, other): - ... - - def __rmul__(self, other): - ... - - def __rtruediv__(self, other): - ... - - def __rfloordiv__(self, other): - ... - - def __rmod__(self, other): - ... - - def __rpow__(self, other): - ... - - def __rand__(self, other): - ... - - def __rxor__(self, other): - ... - - def __ror__(self, other): - ... - - def __lt__(self, other): - ... - - def __le__(self, other): - ... - - def __eq__(self, other): - ... - - def __ne__(self, other): - ... - - def __gt__(self, other): - ... - - def __ge__(self, other): - ... - + def _binaryop(self, other: BinaryOperandType, op: str): ... + def __add__(self, other): ... + def __sub__(self, other): ... + def __mul__(self, other): ... + def __truediv__(self, other): ... + def __floordiv__(self, other): ... + def __mod__(self, other): ... + def __pow__(self, other): ... + def __and__(self, other): ... + def __xor__(self, other): ... + def __or__(self, other): ... + def __radd__(self, other): ... + def __rsub__(self, other): ... + def __rmul__(self, other): ... + def __rtruediv__(self, other): ... 
+ def __rfloordiv__(self, other): ... + def __rmod__(self, other): ... + def __rpow__(self, other): ... + def __rand__(self, other): ... + def __rxor__(self, other): ... + def __ror__(self, other): ... + def __lt__(self, other): ... + def __le__(self, other): ... + def __eq__(self, other): ... + def __ne__(self, other): ... + def __gt__(self, other): ... + def __ge__(self, other): ... @staticmethod - def _check_reflected_op(op) -> Tuple[bool, str]: - ... + def _check_reflected_op(op) -> Tuple[bool, str]: ... diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index 3769b7c360e..dbaafdb5cd2 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -5,68 +5,25 @@ from typing import Set class Reducible: _SUPPORTED_REDUCTIONS: Set - def sum(self): - ... - - def product(self): - ... - - def min(self): - ... - - def max(self): - ... - - def count(self): - ... - - def any(self): - ... - - def all(self): - ... - - def sum_of_squares(self): - ... - - def mean(self): - ... - - def var(self): - ... - - def std(self): - ... - - def median(self): - ... - - def argmax(self): - ... - - def argmin(self): - ... - - def nunique(self): - ... - - def nth(self): - ... - - def collect(self): - ... - - def prod(self): - ... - - def idxmin(self): - ... - - def idxmax(self): - ... - - def first(self): - ... - - def last(self): - ... + def sum(self): ... + def product(self): ... + def min(self): ... + def max(self): ... + def count(self): ... + def any(self): ... + def all(self): ... + def sum_of_squares(self): ... + def mean(self): ... + def var(self): ... + def std(self): ... + def median(self): ... + def argmax(self): ... + def argmin(self): ... + def nunique(self): ... + def nth(self): ... + def collect(self): ... + def prod(self): ... + def idxmin(self): ... + def idxmax(self): ... + def first(self): ... + def last(self): ... 
diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py index 723fc758b13..b0f606e32e6 100644 --- a/python/cudf/cudf/core/mixins/scans.py +++ b/python/cudf/cudf/core/mixins/scans.py @@ -7,5 +7,10 @@ "Mixin encapsulating scan operations.", "SCAN", "_scan", - {"cumsum", "cumprod", "cummin", "cummax",}, # noqa: E231 + { + "cumsum", + "cumprod", + "cummin", + "cummax", + }, # noqa: E231 ) diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi index 38cb9af284f..37995241b1f 100644 --- a/python/cudf/cudf/core/mixins/scans.pyi +++ b/python/cudf/cudf/core/mixins/scans.pyi @@ -5,14 +5,7 @@ from typing import Set class Scannable: _SUPPORTED_SCANS: Set - def cumsum(self): - ... - - def cumprod(self): - ... - - def cummin(self): - ... - - def cummax(self): - ... + def cumsum(self): ... + def cumprod(self): ... + def cummin(self): ... + def cummax(self): ... diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1b946a140c6..39228f034d4 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -278,7 +278,11 @@ def set_names(self, names, level=None, inplace=False): @classmethod @_cudf_nvtx_annotate - def _from_data(cls, data: MutableMapping, name: Any = None,) -> MultiIndex: + def _from_data( + cls, + data: MutableMapping, + name: Any = None, + ) -> MultiIndex: obj = cls.from_frame(cudf.DataFrame._from_data(data=data)) if name is not None: obj.name = name @@ -866,7 +870,8 @@ def _validate_indexer( def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( - self._data.values(), other._data.values(), + self._data.values(), + other._data.values(), ): if not self_col.equals(other_col): return False @@ -1675,9 +1680,11 @@ def get_loc(self, key, method=None, tolerance=None): partial_index = self.__class__._from_data( data=self._data.select_by_index(slice(key_as_table._num_columns)) ) - (lower_bound, 
upper_bound, sort_inds,) = _lexsorted_equal_range( - partial_index, key_as_table, is_sorted - ) + ( + lower_bound, + upper_bound, + sort_inds, + ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) if lower_bound == upper_bound: raise KeyError(key) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index a4810701781..2bed71ea751 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -157,7 +157,11 @@ def _handle_frequency_grouper(self, by): end += offset # generate the labels for binning the key column: - bin_labels = cudf.date_range(start=start, end=end, freq=freq,) + bin_labels = cudf.date_range( + start=start, + end=end, + freq=freq, + ) # We want the (resampled) column of timestamps in the result # to have a resolution closest to the resampling diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 5aa7f616e35..a388e2560ee 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -256,7 +256,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ) elif isinstance(obj, pd.Series): result = cudf.Series( - data=obj, index=cudf.RangeIndex(len(obj)), + data=obj, + index=cudf.RangeIndex(len(obj)), ) else: result = cudf.DataFrame._from_data( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 40e09bb11b8..1f79672f30f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -104,7 +104,8 @@ def __getitem__(self, arg): ): return data return self._frame._from_data( - {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), + {self._frame.name: data}, + index=cudf.Index(self._frame.index[arg]), ) @_cudf_nvtx_annotate @@ -390,7 +391,12 @@ def from_masked_array(cls, data, mask, null_count=None): @_cudf_nvtx_annotate def __init__( - self, data=None, index=None, dtype=None, name=None, nan_as_null=True, + self, + data=None, + index=None, + 
dtype=None, + name=None, + nan_as_null=True, ): if isinstance(data, pd.Series): if name is None: @@ -2368,8 +2374,7 @@ def cov(self, other, min_periods=None): @_cudf_nvtx_annotate def transpose(self): - """Return the transpose, which is by definition self. - """ + """Return the transpose, which is by definition self.""" return self @@ -3762,7 +3767,9 @@ def quarter(self): np.int8 ) return Series._from_data( - {None: res}, index=self.series._index, name=self.series.name, + {None: res}, + index=self.series._index, + name=self.series.name, ) @_cudf_nvtx_annotate @@ -3960,7 +3967,9 @@ def is_quarter_start(self): result = ((day == cudf.Scalar(1)) & first_month).fillna(False) return Series._from_data( - {None: result}, index=self.series._index, name=self.series.name, + {None: result}, + index=self.series._index, + name=self.series.name, ) @property # type: ignore @@ -4009,7 +4018,9 @@ def is_quarter_end(self): result = ((day == last_day) & last_month).fillna(False) return Series._from_data( - {None: result}, index=self.series._index, name=self.series.name, + {None: result}, + index=self.series._index, + name=self.series.name, ) @property # type: ignore @@ -4081,7 +4092,9 @@ def is_year_end(self): result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) result = result.fillna(False) return Series._from_data( - {None: result}, index=self.series._index, name=self.series.name, + {None: result}, + index=self.series._index, + name=self.series.name, ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index de10261315c..3e91aa634f4 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -34,7 +34,12 @@ class SingleColumnFrame(Frame, NotIterable): @_cudf_nvtx_annotate def _reduce( - self, op, axis=None, level=None, numeric_only=None, **kwargs, + self, + op, + axis=None, + level=None, + numeric_only=None, + **kwargs, ): if axis not in 
(None, 0): raise NotImplementedError("axis parameter is not implemented yet") diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index b110a10e1e7..f766ea0de74 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -346,12 +346,14 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): else: if infer_datetime_format and format is None: format = column.datetime.infer_format( - element=col[0], dayfirst=dayfirst, + element=col[0], + dayfirst=dayfirst, ) elif format is None: format = column.datetime.infer_format(element=col[0]) col = col.as_datetime_column( - dtype=_unit_dtype_map[unit], format=format, + dtype=_unit_dtype_map[unit], + format=format, ) return col @@ -923,8 +925,7 @@ def date_range( def _has_fixed_frequency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains fixed frequency offset - """ + """Utility to determine if `freq` contains fixed frequency offset""" fixed_frequencies = { "weeks", "days", @@ -940,8 +941,7 @@ def _has_fixed_frequency(freq: DateOffset) -> bool: def _has_non_fixed_frequency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains non-fixed frequency offset - """ + """Utility to determine if `freq` contains non-fixed frequency offset""" non_fixed_frequencies = {"years", "months"} return len(freq.kwds.keys() & non_fixed_frequencies) > 0 diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index d589b68e7b2..7eea7cedaad 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -247,6 +247,8 @@ def _proc_inf_strings(col): # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column col = libstrings.replace_multi( - col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), + col, + as_column(["+", "inf", "inity"]), + as_column(["", "Inf", ""]), ) return col diff --git 
a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py index 3b6b3b4b831..b54dd9c2367 100644 --- a/python/cudf/cudf/core/udf/lowering.py +++ b/python/cudf/cudf/core/udf/lowering.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import operator from llvmlite import ir @@ -117,7 +119,8 @@ def masked_scalar_unary_op_impl(context, builder, sig, args): builder, lambda x: op(x), nb_signature( - masked_return_type.value_type, masked_type_1.value_type, + masked_return_type.value_type, + masked_type_1.value_type, ), (m1.value,), ) diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 56e8bec74dc..2be1691a1a6 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import operator from numba import types @@ -271,7 +273,11 @@ def generic(self, args, kws): if isinstance(args[0], MaskedType) and isinstance(args[1], NAType): # In the case of op(Masked, NA), the result has the same # dtype as the original regardless of what it is - return nb_signature(args[0], args[0], na_type,) + return nb_signature( + args[0], + args[0], + na_type, + ) elif isinstance(args[0], NAType) and isinstance(args[1], MaskedType): return nb_signature(args[1], na_type, args[1]) @@ -299,7 +305,11 @@ def generic(self, args, kws): return_type = self.context.resolve_function_type( self.key, to_resolve_types, kws ).return_type - return nb_signature(MaskedType(return_type), args[0], args[1],) + return nb_signature( + MaskedType(return_type), + args[0], + args[1], + ) @cuda_decl_registry.register_global(operator.is_) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index fa482d52104..53cbaebb9f1 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -273,7 +273,10 @@ def _apply_agg(self, agg_name): return self._apply_agg_dataframe(self.obj, 
agg_name) def _reduce( - self, op: str, *args, **kwargs, + self, + op: str, + *args, + **kwargs, ): """Calculate the rolling {op}. diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 9e38b6e896d..e4824c2ccbe 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import cudf from cudf import _lib as libcudf from cudf.utils import ioutils @@ -16,7 +16,8 @@ def read_avro( """{docstring}""" is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( - path_or_data=filepath_or_buffer, **kwargs, + path_or_data=filepath_or_buffer, + **kwargs, ) if not is_single_filepath_or_buffer: raise NotImplementedError( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index f15fef19c07..a81563884d9 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -52,7 +52,8 @@ def read_csv( """{docstring}""" is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( - path_or_data=filepath_or_buffer, **kwargs, + path_or_data=filepath_or_buffer, + **kwargs, ) if not is_single_filepath_or_buffer: raise NotImplementedError( diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 1f876214b16..142b9c26f96 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
import warnings from io import BytesIO, StringIO @@ -65,7 +65,8 @@ def read_json( ) if not ioutils.ensure_single_filepath_or_buffer( - path_or_data=path_or_buf, **kwargs, + path_or_data=path_or_buf, + **kwargs, ): raise NotImplementedError( "`read_json` does not yet support reading " diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 62260cbb822..0ac0e02e4d1 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -162,7 +162,9 @@ def read_orc_metadata(path): @ioutils.doc_read_orc_statistics() def read_orc_statistics( - filepaths_or_buffers, columns=None, **kwargs, + filepaths_or_buffers, + columns=None, + **kwargs, ): """{docstring}""" @@ -321,7 +323,9 @@ def read_orc( for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source, **kwargs, + passed_filesystem=None, + path=source, + **kwargs, ) source = stringify_path(source) source = fs.sep.join([source, "*.orc"]) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 52203d0194b..baedc3f174b 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -179,7 +179,11 @@ def read_parquet_metadata(path): @_cudf_nvtx_annotate def _process_dataset( - paths, fs, filters=None, row_groups=None, categorical_partitions=True, + paths, + fs, + filters=None, + row_groups=None, + categorical_partitions=True, ): # Returns: # file_list - Expanded/filtered list of paths @@ -203,7 +207,10 @@ def _process_dataset( # Initialize ds.FilesystemDataset dataset = ds.dataset( - paths, filesystem=fs, format="parquet", partitioning="hive", + paths, + filesystem=fs, + format="parquet", + partitioning="hive", ) file_list = dataset.files if len(file_list) == 0: @@ -287,7 +294,8 @@ def _process_dataset( filtered_row_groups = [ rg_info.id for rg_fragment in file_fragment.split_by_row_group( - filters, schema=dataset.schema, + filters, + schema=dataset.schema, ) for 
rg_info in rg_fragment.row_groups ] @@ -390,7 +398,10 @@ def read_parquet( filepaths_or_buffers = [] if use_python_file_object: open_file_options = _default_open_file_options( - open_file_options, columns, row_groups, fs=fs, + open_file_options, + columns, + row_groups, + fs=fs, ) for i, source in enumerate(filepath_or_buffer): tmp_source, compression = ioutils.get_filepath_or_buffer( @@ -455,7 +466,10 @@ def _parquet_to_frame( # one call to `_read_parquet` if not partition_keys: return _read_parquet( - paths_or_buffers, *args, row_groups=row_groups, **kwargs, + paths_or_buffers, + *args, + row_groups=row_groups, + **kwargs, ) # For partitioned data, we need a distinct read for each @@ -477,7 +491,10 @@ def _parquet_to_frame( # Add new DataFrame to our list dfs.append( _read_parquet( - key_paths, *args, row_groups=key_row_groups, **kwargs, + key_paths, + *args, + row_groups=key_row_groups, + **kwargs, ) ) # Add partition columns to the last DataFrame diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index e5a3beb7d61..86f99b319f0 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -11,7 +11,10 @@ @_cudf_nvtx_annotate @ioutils.doc_read_text() def read_text( - filepath_or_buffer, delimiter=None, byte_range=None, **kwargs, + filepath_or_buffer, + delimiter=None, + byte_range=None, + **kwargs, ): """{docstring}""" diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index f6b5e0f3ccc..2ff311c1399 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -320,7 +320,8 @@ def gen_rand_series(dtype, size, **kwargs): def _decimal_series(input, dtype): return cudf.Series( - [x if x is None else Decimal(x) for x in input], dtype=dtype, + [x if x is None else Decimal(x) for x in input], + dtype=dtype, ) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index e1c7b42c7a3..c3e25adad77 100644 --- 
a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -75,7 +75,10 @@ class Parameters: """ def __init__( - self, num_rows=2048, column_parameters=None, seed=None, + self, + num_rows=2048, + column_parameters=None, + seed=None, ): self.num_rows = num_rows if column_parameters is None: @@ -201,7 +204,10 @@ def _generate_column(column_params, num_rows): def generate( - path, parameters, format=None, use_threads=True, + path, + parameters, + format=None, + use_threads=True, ): """ Generate dataset using given parameters and write to given format @@ -294,7 +300,10 @@ def get_dataframe(parameters, use_threads): pool.close() pool.join() # Convert to Pandas DataFrame and sort columns appropriately - tbl = pa.Table.from_arrays(column_data, schema=schema,) + tbl = pa.Table.from_arrays( + column_data, + schema=schema, + ) if columns_to_sort: tbl = tbl.to_pandas() tbl = tbl.sort_values(columns_to_sort) @@ -303,7 +312,7 @@ def get_dataframe(parameters, use_threads): def rand_dataframe( - dtypes_meta, rows, seed=random.randint(0, 2 ** 32 - 1), use_threads=True + dtypes_meta, rows, seed=random.randint(0, 2**32 - 1), use_threads=True ): """ Generates a random table. @@ -550,7 +559,11 @@ def rand_dataframe( # is merged. 
df = get_dataframe( - Parameters(num_rows=rows, column_parameters=column_params, seed=seed,), + Parameters( + num_rows=rows, + column_parameters=column_params, + seed=seed, + ), use_threads=use_threads, ) @@ -568,7 +581,10 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): low, high = iinfo.min, iinfo.max return lambda: np.random.randint( - low=low, high=high, size=size, dtype=dtype, + low=low, + high=high, + size=size, + dtype=dtype, ) @@ -578,12 +594,18 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): """ if min_bound is not None and max_bound is not None: low, high = min_bound, max_bound - return lambda: np.random.uniform(low=low, high=high, size=size,) + return lambda: np.random.uniform( + low=low, + high=high, + size=size, + ) else: finfo = np.finfo(dtype) return ( lambda: np.random.uniform( - low=finfo.min / 2, high=finfo.max / 2, size=size, + low=finfo.min / 2, + high=finfo.max / 2, + size=size, ) * 2 ) @@ -632,11 +654,11 @@ def boolean_generator(size): def decimal_generator(dtype, size): max_integral = 10 ** (dtype.precision - dtype.scale) - 1 - max_float = (10 ** dtype.scale - 1) if dtype.scale != 0 else 0 + max_float = (10**dtype.scale - 1) if dtype.scale != 0 else 0 return lambda: ( np.random.uniform( low=-max_integral, - high=max_integral + (max_float / 10 ** dtype.scale), + high=max_integral + (max_float / 10**dtype.scale), size=size, ) ) @@ -658,7 +680,10 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): values = float_generator(dtype=dtype, size=cardinality)() elif dtype.kind in ("U", "O"): values = [ - mimesis.random.random.schoice(string.printable, 100,) + mimesis.random.random.schoice( + string.printable, + 100, + ) for _ in range(cardinality) ] elif dtype.kind == "M": @@ -722,7 +747,9 @@ def make_array_for_struct(dtype, cardinality, size, max_null_frequency): return pa.array( vals, mask=np.random.choice( - [True, False], size=size, p=[null_frequency, 1 - null_frequency], + [True, 
False], + size=size, + p=[null_frequency, 1 - null_frequency], ) if null_frequency > 0.0 else None, diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 9d762f26ebd..19ef2b66c2a 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -60,7 +60,9 @@ def test_ufunc_index(ufunc): # scale to avoid issues with overflow, etc. We use ints because some # operations (like bitwise ops) are not defined for floats. pandas_args = args = [ - cudf.Index(cp.random.randint(low=1, high=10, size=N),) + cudf.Index( + cp.random.randint(low=1, high=10, size=N), + ) for _ in range(ufunc.nin) ] @@ -283,7 +285,8 @@ def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): @pytest.mark.parametrize( - "func", [np.add], + "func", + [np.add], ) def test_ufunc_cudf_series_error_with_out_kwarg(func): cudf_s1 = cudf.Series(data=[-1, 2, 3, 0]) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index aa4075eb887..742a3d7cd06 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1463,8 +1463,8 @@ def test_scalar_power(dtype_l, dtype_r): lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - expect = lval_host ** rval_host - got = lval_gpu ** rval_gpu + expect = lval_host**rval_host + got = lval_gpu**rval_gpu assert expect == got.value assert expect.dtype == got.dtype @@ -1478,7 +1478,7 @@ def test_scalar_power_invalid(dtype_l, dtype_r): rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) with pytest.raises(TypeError): - lval_gpu ** rval_gpu + lval_gpu**rval_gpu @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 5bceaac45c7..61f09c39123 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -38,7 +38,9 @@ def 
_hide_deprecated_pandas_categorical_inplace_warnings(function_name): def _hide_cudf_safe_casting_warning(): with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", "Can't safely cast column", category=UserWarning, + "ignore", + "Can't safely cast column", + category=UserWarning, ) yield diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 365b351061d..854e79af9f4 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -415,8 +415,16 @@ def test_as_column_buffer(data, expected): {"type": pa.decimal128(3)}, {"dtype": cudf.core.dtypes.Decimal128Dtype(3, 0)}, ), - ([{"a": 1, "b": 3}, {"c": 2, "d": 4}], {}, {},), - ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], {}, {},), + ( + [{"a": 1, "b": 3}, {"c": 2, "d": 4}], + {}, + {}, + ), + ( + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + {}, + {}, + ), ], ) def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): @@ -533,7 +541,8 @@ def test_concatenate_large_column_strings(): ], ) @pytest.mark.parametrize( - "data", [[1, 2, 0]], + "data", + [[1, 2, 0]], ) def test_astype_with_aliases(alias, expect_dtype, data): pd_data = pd.Series(data) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 1ab5931fe5f..3cc3e4153b1 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -341,7 +341,9 @@ def test_pandas_concat_compatibility_axis1(): got = gd.concat([d1, d2, d3, d4, d5], axis=1) assert_eq( - got, expect, check_index_type=True, + got, + expect, + check_index_type=True, ) @@ -658,7 +660,9 @@ def test_concat_dataframe_with_multiIndex(df1, df2): expected = pd.concat([pdf1, pdf2], axis=1) assert_eq( - expected, actual, check_index_type=True, + expected, + actual, + check_index_type=True, ) @@ -749,7 +753,14 @@ def test_concat_join_axis_1_dup_error(objs): # we do not support duplicate columns with 
pytest.raises(NotImplementedError): assert_eq( - pd.concat(objs, axis=1,), gd.concat(gpu_objs, axis=1,), + pd.concat( + objs, + axis=1, + ), + gd.concat( + gpu_objs, + axis=1, + ), ) @@ -781,7 +792,11 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis ) actual = gd.concat( - gpu_objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis, + gpu_objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, ) # TODO: Remove special handling below # after following bug from pandas is fixed: @@ -969,7 +984,9 @@ def test_concat_join_no_overlapping_columns_many_and_empty( axis=axis, ) assert_eq( - expected, actual, check_index_type=False, + expected, + actual, + check_index_type=False, ) @@ -1028,10 +1045,18 @@ def test_concat_join_no_overlapping_columns_many_and_empty2( objs_gd = [gd.from_pandas(o) if o is not None else o for o in objs] expected = pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis, + objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, ) actual = gd.concat( - objs_gd, sort=sort, join=join, ignore_index=ignore_index, axis=axis, + objs_gd, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, ) assert_eq(expected, actual, check_index_type=False) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 6176184b670..0c4bf68faa9 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -580,7 +580,9 @@ def test_csv_reader_NaN_values(): # data type detection should evaluate the column to int8 (all nulls) gdf = read_csv( - StringIO(all_cells), header=None, na_values=custom_na_values, + StringIO(all_cells), + header=None, + na_values=custom_na_values, ) assert gdf.dtypes[0] == "int8" assert all(gdf["0"][idx] is cudf.NA for idx in range(len(gdf["0"]))) diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py 
index 710df78e36b..8dda5e793a0 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. """ Test related to Cut @@ -132,15 +132,26 @@ def test_cut_labels_non_unique( ], ) @pytest.mark.parametrize( - "bins", [1, 2, 3, [1, 2, 3], [0, 2, 4, 6, 10]], + "bins", + [1, 2, 3, [1, 2, 3], [0, 2, 4, 6, 10]], ) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("precision", [3]) def test_cut_right(x, bins, right, precision): - pcat = pd.cut(x=x, bins=bins, right=right, precision=precision,) + pcat = pd.cut( + x=x, + bins=bins, + right=right, + precision=precision, + ) pindex = pd.CategoricalIndex(pcat) - gindex = cut(x=x, bins=bins, right=right, precision=precision,) + gindex = cut( + x=x, + bins=bins, + right=right, + precision=precision, + ) assert_eq(pindex, gindex) @@ -155,7 +166,8 @@ def test_cut_right(x, bins, right, precision): ], ) @pytest.mark.parametrize( - "bins", [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], + "bins", + [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], ) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) @@ -199,7 +211,8 @@ def test_cut_drop_duplicates( ], ) @pytest.mark.parametrize( - "bins", [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], + "bins", + [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], ) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) @@ -244,7 +257,8 @@ def test_cut_drop_duplicates_raises( ], ) @pytest.mark.parametrize( - "bins", [pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])], + "bins", + [pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])], ) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("precision", [1, 2, 3]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 08c8e3485a3..303c245777c 100644 --- 
a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -322,7 +322,8 @@ def test_dataframe_basic(): ], ) @pytest.mark.parametrize( - "columns", [["a"], ["b"], "a", "b", ["a", "b"]], + "columns", + [["a"], ["b"], "a", "b", ["a", "b"]], ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_columns(pdf, columns, inplace): @@ -423,8 +424,14 @@ def test_dataframe_drop_index(pdf, index, inplace): ("weight", 1), ("length", 1), ("cow", None), - ("lama", None,), - ("falcon", None,), + ( + "lama", + None, + ), + ( + "falcon", + None, + ), ], ) @pytest.mark.parametrize("inplace", [True, False]) @@ -452,7 +459,8 @@ def test_dataframe_drop_multiindex(pdf, index, level, inplace): ], ) @pytest.mark.parametrize( - "labels", [["a"], ["b"], "a", "b", ["a", "b"]], + "labels", + [["a"], ["b"], "a", "b", ["a", "b"]], ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): @@ -1828,7 +1836,8 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): for i in range(num_cols): colname = string.ascii_lowercase[i] data = pd.Series( - np.random.randint(0, 26, num_rows).astype(np_dtype), dtype=dtype, + np.random.randint(0, 26, num_rows).astype(np_dtype), + dtype=dtype, ) if nulls == "some": idx = np.random.choice( @@ -3039,7 +3048,8 @@ def test_dataframe_sort_index( index, axis, ascending, inplace, ignore_index, na_position ): pdf = pd.DataFrame( - {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index, + {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, + index=index, ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -3276,7 +3286,8 @@ def test_select_dtype(): ) assert_exceptions_equal( - lfunc=pdf.select_dtypes, rfunc=gdf.select_dtypes, + lfunc=pdf.select_dtypes, + rfunc=gdf.select_dtypes, ) gdf = cudf.DataFrame( @@ -3297,7 +3308,8 @@ def test_select_dtype(): ) pdf = gdf.to_pandas() assert_eq( - pdf.select_dtypes("int64"), gdf.select_dtypes("int64"), + 
pdf.select_dtypes("int64"), + gdf.select_dtypes("int64"), ) @@ -4151,7 +4163,8 @@ def test_series_values_host_property(data): marks=pytest.mark.xfail(raises=NotImplementedError), ), pytest.param( - ["m", "a", "d", "v"], marks=pytest.mark.xfail(raises=TypeError), + ["m", "a", "d", "v"], + marks=pytest.mark.xfail(raises=TypeError), ), ], ) @@ -4706,7 +4719,8 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): { "a": [10, 11, 12, 13, 14, 15], "b": cudf.Series( - [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False, + [10, None, np.NaN, 2234, None, np.NaN], + nan_as_null=False, ), } ) @@ -4725,33 +4739,51 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op, expected): [ ( "max", - cudf.Series([10, None, None, 2234, None, 453], dtype="int64",), + cudf.Series( + [10, None, None, 2234, None, 453], + dtype="int64", + ), + ), + ( + "min", + cudf.Series( + [10, None, None, 13, None, 15], + dtype="int64", + ), ), - ("min", cudf.Series([10, None, None, 13, None, 15], dtype="int64",),), ( "sum", - cudf.Series([20, None, None, 2247, None, 468], dtype="int64",), + cudf.Series( + [20, None, None, 2247, None, 468], + dtype="int64", + ), ), ( "product", - cudf.Series([100, None, None, 29042, None, 6795], dtype="int64",), + cudf.Series( + [100, None, None, 29042, None, 6795], + dtype="int64", + ), ), ( "mean", cudf.Series( - [10.0, None, None, 1123.5, None, 234.0], dtype="float32", + [10.0, None, None, 1123.5, None, 234.0], + dtype="float32", ), ), ( "var", cudf.Series( - [0.0, None, None, 1233210.25, None, 47961.0], dtype="float32", + [0.0, None, None, 1233210.25, None, 47961.0], + dtype="float32", ), ), ( "std", cudf.Series( - [0.0, None, None, 1110.5, None, 219.0], dtype="float32", + [0.0, None, None, 1110.5, None, 219.0], + dtype="float32", ), ), ], @@ -4761,7 +4793,8 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): { "a": [10, 11, None, 13, None, 15], "b": cudf.Series( - [10, None, 323, 2234, None, 453], nan_as_null=False, + [10, None, 323, 
2234, None, 453], + nan_as_null=False, ), } ) @@ -4977,7 +5010,8 @@ def test_insert(data): @pytest.mark.parametrize( - "data", [{"A": [1, 2, 3], "B": ["a", "b", "c"]}], + "data", + [{"A": [1, 2, 3], "B": ["a", "b", "c"]}], ) def test_insert_NA(data): pdf = pd.DataFrame.from_dict(data) @@ -8090,7 +8124,8 @@ def custom_func(df, column): @pytest.mark.parametrize( - "op", ["count", "kurt", "kurtosis", "skew"], + "op", + ["count", "kurt", "kurtosis", "skew"], ) def test_dataframe_axis1_unsupported_ops(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) @@ -8273,13 +8308,16 @@ def test_agg_for_dataframe_with_string_columns(aggs): @pytest.mark.parametrize( - "join", ["left"], + "join", + ["left"], ) @pytest.mark.parametrize( - "overwrite", [True, False], + "overwrite", + [True, False], ) @pytest.mark.parametrize( - "errors", ["ignore"], + "errors", + ["ignore"], ) @pytest.mark.parametrize( "data", @@ -8336,7 +8374,8 @@ def test_update_for_dataframes(data, data2, join, overwrite, errors): @pytest.mark.parametrize( - "join", ["right"], + "join", + ["right"], ) def test_update_for_right_join(join): gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) @@ -8349,7 +8388,8 @@ def test_update_for_right_join(join): @pytest.mark.parametrize( - "errors", ["raise"], + "errors", + ["raise"], ) def test_update_for_data_overlap(errors): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) @@ -8440,10 +8480,12 @@ def test_dataframe_setitem_cupy_array(): "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] ) @pytest.mark.parametrize( - "index", [{0: 123, 1: 4, 2: 6}], + "index", + [{0: 123, 1: 4, 2: 6}], ) @pytest.mark.parametrize( - "level", ["x", 0], + "level", + ["x", 0], ) def test_rename_for_level_MultiIndex_dataframe(data, index, level): pdf = pd.DataFrame( @@ -8463,10 +8505,12 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level): "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] ) @pytest.mark.parametrize( - "columns", [{"a": 
"f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s], + "columns", + [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s], ) @pytest.mark.parametrize( - "level", [0, 1], + "level", + [0, 1], ) def test_rename_for_level_MultiColumn_dataframe(data, columns, level): gdf = cudf.DataFrame(data) @@ -8654,7 +8698,8 @@ def test_dataframe_indexing_setitem_np_cp_array(array, is_error): @pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], + "data", + [{"a": [1, 2, 3], "b": [1, 1, 0]}], ) def test_frame_series_where_other(data): gdf = cudf.DataFrame(data) @@ -8714,7 +8759,8 @@ def test_frame_series_where_other(data): ], ) @pytest.mark.parametrize( - "min_per", [0, 1, 2, 3, 4], + "min_per", + [0, 1, 2, 3, 4], ) def test_pearson_corr_passing(data, gkey, min_per): gdf = cudf.DataFrame(data) @@ -8752,7 +8798,10 @@ def test_pearson_corr_empty_columns(): expected = pdf.groupby("id").corr("pearson") assert_eq( - expected, actual, check_dtype=False, check_index_type=False, + expected, + actual, + check_dtype=False, + check_index_type=False, ) @@ -8774,7 +8823,8 @@ def test_pearson_corr_empty_columns(): @pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) def test_pearson_corr_invalid_column_types(data, gkey): with pytest.raises( - TypeError, match="Correlation accepts only numerical column-pairs", + TypeError, + match="Correlation accepts only numerical column-pairs", ): cudf.DataFrame(data).groupby(gkey).corr("pearson") @@ -8865,10 +8915,12 @@ def test_dataframe_add_suffix(): ], ) @pytest.mark.parametrize( - "min_periods", [0, 3], + "min_periods", + [0, 3], ) @pytest.mark.parametrize( - "ddof", [1, 2], + "ddof", + [1, 2], ) def test_groupby_covariance(data, gkey, min_periods, ddof): gdf = cudf.DataFrame(data) @@ -8904,7 +8956,10 @@ def test_groupby_covariance_empty_columns(): expected = pdf.groupby("id").cov() assert_eq( - expected, actual, check_dtype=False, check_index_type=False, + expected, + actual, + check_dtype=False, + check_index_type=False, ) @@ -8917,7 
+8972,8 @@ def test_groupby_cov_invalid_column_types(): }, ) with pytest.raises( - TypeError, match="Covariance accepts only numerical column-pairs", + TypeError, + match="Covariance accepts only numerical column-pairs", ): gdf.groupby("id").cov() @@ -8940,7 +8996,9 @@ def test_groupby_cov_positive_semidefinite_matrix(): expected.reset_index(drop=True, inplace=True) assert_eq( - expected, actual, check_dtype=False, + expected, + actual, + check_dtype=False, ) @@ -8979,15 +9037,19 @@ def test_diff_dataframe_numeric_dtypes(data, periods): expected = pdf.diff(periods=periods, axis=0) assert_eq( - expected, actual, check_dtype=False, + expected, + actual, + check_dtype=False, ) @pytest.mark.parametrize( - ("precision", "scale"), [(5, 2), (8, 5)], + ("precision", "scale"), + [(5, 2), (8, 5)], ) @pytest.mark.parametrize( - "dtype", [cudf.Decimal32Dtype, cudf.Decimal64Dtype], + "dtype", + [cudf.Decimal32Dtype, cudf.Decimal64Dtype], ) def test_diff_decimal_dtypes(precision, scale, dtype): gdf = cudf.DataFrame( @@ -9000,7 +9062,9 @@ def test_diff_decimal_dtypes(precision, scale, dtype): expected = pdf.diff() assert_eq( - expected, actual, check_dtype=False, + expected, + actual, + check_dtype=False, ) @@ -9043,7 +9107,8 @@ def test_dataframe_assign_cp_np_array(): @pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], + "data", + [{"a": [1, 2, 3], "b": [1, 1, 0]}], ) def test_dataframe_nunique(data): gdf = cudf.DataFrame(data) @@ -9056,7 +9121,8 @@ def test_dataframe_nunique(data): @pytest.mark.parametrize( - "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], + "data", + [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], ) def test_dataframe_nunique_index(data): gdf = cudf.DataFrame(data) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 9d120819248..964ac9e5457 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1,4 +1,4 @@ -# 
Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import datetime import datetime as dt @@ -580,7 +580,11 @@ def test_datetime_dataframe(): dtype="datetime64[ns]", freq=None, ), - pd.DatetimeIndex([], dtype="datetime64[ns]", freq=None,), + pd.DatetimeIndex( + [], + dtype="datetime64[ns]", + freq=None, + ), pd.Series([1, 2, 3]).astype("datetime64[ns]"), pd.Series([1, 2, 3]).astype("datetime64[us]"), pd.Series([1, 2, 3]).astype("datetime64[ms]"), @@ -681,7 +685,11 @@ def test_to_datetime_not_implemented(): pd.Series([0, 1, -1]), pd.Series([0, 1, -1, 100, 200, 47637]), [10, 12, 1200, 15003], - pd.DatetimeIndex([], dtype="datetime64[ns]", freq=None,), + pd.DatetimeIndex( + [], + dtype="datetime64[ns]", + freq=None, + ), pd.Index([1, 2, 3, 4]), ], ) @@ -941,7 +949,8 @@ def test_datetime_subtract(data, other, data_dtype, other_dtype): ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) @pytest.mark.parametrize( - "op", ["add", "sub"], + "op", + ["add", "sub"], ) def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op): gsr = cudf.Series(data=data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 05d6886c297..e779ac276a3 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,3 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. import contextlib import doctest import inspect @@ -88,7 +89,10 @@ def test_docstring(self, docstring): # These global names are pre-defined and can be used in doctests # without first importing them. - globals = dict(cudf=cudf, np=np,) + globals = dict( + cudf=cudf, + np=np, + ) docstring.globs = globals # Capture stdout and include failing outputs in the traceback. 
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 1e24dd9d275..3e7891ba0af 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -199,7 +199,8 @@ def test_dropna_thresh_cols(thresh, subset, inplace): actual = gdf assert_eq( - expected, actual, + expected, + actual, ) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index bc43c82729b..e8a695570f0 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import itertools as it import random @@ -615,5 +615,6 @@ def test_drop_duplicates_multi_index(): for col in gdf.columns: assert_df( - gdf[col].drop_duplicates().to_pandas(), pdf[col].drop_duplicates(), + gdf[col].drop_duplicates().to_pandas(), + pdf[col].drop_duplicates(), ) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 61c7d1958a0..eba37c1f5af 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import datetime import itertools @@ -223,7 +223,8 @@ def test_groupby_getitem_getattr(as_index): by="x", ) assert_groupby_results_equal( - pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum(), + pdf.groupby("x")[["y"]].sum(), + gdf.groupby("x")[["y"]].sum(), ) assert_groupby_results_equal( pdf.groupby(["x", "y"], as_index=as_index).sum(), @@ -375,7 +376,7 @@ def test_groupby_agg_decimal(num_groups, nelem_per_group, func): # The number of digits before the decimal to use. 
whole_digits = 2 - scale = 10 ** whole_digits + scale = 10**whole_digits nelem = num_groups * nelem_per_group # The unique is necessary because otherwise if there are duplicates idxmin @@ -589,7 +590,8 @@ def test_groupby_levels(level): pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) gdf = cudf.from_pandas(pdf) assert_groupby_results_equal( - pdf.groupby(level=level).sum(), gdf.groupby(level=level).sum(), + pdf.groupby(level=level).sum(), + gdf.groupby(level=level).sum(), ) @@ -840,7 +842,11 @@ def test_groupby_multi_agg_hash_groupby(agg): coll_dict[prefix + this_name] = float coll_dict["id"] = int gdf = cudf.datasets.timeseries( - start="2000", end="2000-01-2", dtypes=coll_dict, freq="1s", seed=1, + start="2000", + end="2000-01-2", + dtypes=coll_dict, + freq="1s", + seed=1, ).reset_index(drop=True) pdf = gdf.to_pandas() check_dtype = False if "count" in agg else True @@ -975,7 +981,9 @@ def test_groupby_cat(): ) gdf = cudf.from_pandas(pdf) assert_groupby_results_equal( - pdf.groupby("a").count(), gdf.groupby("a").count(), check_dtype=False, + pdf.groupby("a").count(), + gdf.groupby("a").count(), + check_dtype=False, ) @@ -1046,7 +1054,9 @@ def test_groupby_size(): gdf = cudf.from_pandas(pdf) assert_groupby_results_equal( - pdf.groupby("a").size(), gdf.groupby("a").size(), check_dtype=False, + pdf.groupby("a").size(), + gdf.groupby("a").size(), + check_dtype=False, ) assert_groupby_results_equal( @@ -1057,7 +1067,9 @@ def test_groupby_size(): sr = pd.Series(range(len(pdf))) assert_groupby_results_equal( - pdf.groupby(sr).size(), gdf.groupby(sr).size(), check_dtype=False, + pdf.groupby(sr).size(), + gdf.groupby(sr).size(), + check_dtype=False, ) @@ -1282,7 +1294,8 @@ def test_groupby_nunique(agg, by): @pytest.mark.parametrize( - "n", [0, 1, 2, 10], + "n", + [0, 1, 2, 10], ) @pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) def test_groupby_nth(n, by): diff --git a/python/cudf/cudf/tests/test_hash_vocab.py 
b/python/cudf/cudf/tests/test_hash_vocab.py index a30f4e20849..dcf40417e4f 100644 --- a/python/cudf/cudf/tests/test_hash_vocab.py +++ b/python/cudf/cudf/tests/test_hash_vocab.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import filecmp import os diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b96b8386b10..37286c65341 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -464,7 +464,8 @@ def test_range_index_from_range(data): @pytest.mark.parametrize( - "n", [-10, -5, -2, 0, 1, 0, 2, 5, 10], + "n", + [-10, -5, -2, 0, 1, 0, 2, 5, 10], ) def test_empty_df_head_tail_index(n): df = cudf.DataFrame() @@ -511,11 +512,36 @@ def test_empty_df_head_tail_index(n): -pd.Index(np.arange(10)), None, ), - (pd.Index([1, 2, np.nan]), pd.Index([1, 2, np.nan]) == 4, None, None,), - (pd.Index([1, 2, np.nan]), pd.Index([1, 2, np.nan]) != 4, None, None,), - (pd.Index([-2, 3, -4, -79]), [True, True, True], None, ValueError,), - (pd.Index([-2, 3, -4, -79]), [True, True, True, False], None, None,), - (pd.Index([-2, 3, -4, -79]), [True, True, True, False], 17, None,), + ( + pd.Index([1, 2, np.nan]), + pd.Index([1, 2, np.nan]) == 4, + None, + None, + ), + ( + pd.Index([1, 2, np.nan]), + pd.Index([1, 2, np.nan]) != 4, + None, + None, + ), + ( + pd.Index([-2, 3, -4, -79]), + [True, True, True], + None, + ValueError, + ), + ( + pd.Index([-2, 3, -4, -79]), + [True, True, True, False], + None, + None, + ), + ( + pd.Index([-2, 3, -4, -79]), + [True, True, True, False], + 17, + None, + ), (pd.Index(list("abcdgh")), pd.Index(list("abcdgh")) != "g", "3", None), ( pd.Index(list("abcdgh")), @@ -1818,7 +1844,8 @@ def test_index_rangeindex_search_range(): @pytest.mark.parametrize( - "rge", [(1, 10, 1), (1, 10, 3), (10, -17, -1), (10, -17, -3)], + "rge", + [(1, 10, 1), (1, 10, 3), (10, -17, -1), (10, -17, -3)], ) def 
test_index_rangeindex_get_item_basic(rge): pridx = pd.RangeIndex(*rge) @@ -1829,7 +1856,8 @@ def test_index_rangeindex_get_item_basic(rge): @pytest.mark.parametrize( - "rge", [(1, 10, 3), (10, 1, -3)], + "rge", + [(1, 10, 3), (10, 1, -3)], ) def test_index_rangeindex_get_item_out_of_bounds(rge): gridx = cudf.RangeIndex(*rge) @@ -1838,7 +1866,8 @@ def test_index_rangeindex_get_item_out_of_bounds(rge): @pytest.mark.parametrize( - "rge", [(10, 1, 1), (-17, 10, -3)], + "rge", + [(10, 1, 1), (-17, 10, -3)], ) def test_index_rangeindex_get_item_null_range(rge): gridx = cudf.RangeIndex(*rge) @@ -1945,7 +1974,8 @@ def test_get_loc_single_unique_numeric(idx, key, method): @pytest.mark.parametrize( - "idx", [pd.RangeIndex(3, 100, 4)], + "idx", + [pd.RangeIndex(3, 100, 4)], ) @pytest.mark.parametrize("key", list(range(1, 110, 3))) @pytest.mark.parametrize("method", [None, "ffill"]) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index c3b414c2d4a..740c32a8a26 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1379,7 +1379,8 @@ def test_dataframe_sliced(gdf_kwargs, slice): ], ) @pytest.mark.parametrize( - "slice", [slice(6), slice(1), slice(7), slice(1, 3)], + "slice", + [slice(6), slice(1), slice(7), slice(1, 3)], ) def test_dataframe_iloc_index(gdf, slice): pdf = gdf.to_pandas() @@ -1481,7 +1482,7 @@ def test_iloc_decimal(): cudf.Decimal64Dtype(scale=2, precision=3) ) got = sr.iloc[[3, 2, 1, 0]] - expect = cudf.Series(["4.00", "3.00", "2.00", "1.00"],).astype( - cudf.Decimal64Dtype(scale=2, precision=3) - ) + expect = cudf.Series( + ["4.00", "3.00", "2.00", "1.00"], + ).astype(cudf.Decimal64Dtype(scale=2, precision=3)) assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True)) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index fc193441113..e1104829914 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ 
b/python/cudf/cudf/tests/test_interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,7 +8,8 @@ @pytest.mark.parametrize( - "data1, data2", [(1, 2), (1.0, 2.0), (3, 4.0)], + "data1, data2", + [(1, 2), (1.0, 2.0), (3, 4.0)], ) @pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) @@ -48,7 +49,8 @@ def test_create_interval_series(data1, data2, data3, data4, closed): @pytest.mark.parametrize( - "data1, data2", [(1, 2), (1.0, 2.0), (3, 4.0)], + "data1, data2", + [(1, 2), (1.0, 2.0), (3, 4.0)], ) @pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index f478216cdcf..c03d26a0ed2 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1248,10 +1248,12 @@ def test_decimal_typecast_outer(dtype): @pytest.mark.parametrize( - "dtype_l", [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)], + "dtype_l", + [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)], ) @pytest.mark.parametrize( - "dtype_r", [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)], + "dtype_r", + [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)], ) def test_mixed_decimal_typecast(dtype_l, dtype_r): other_data = ["a", "b", "c", "d"] @@ -1893,7 +1895,8 @@ def test_join_merge_with_on(lhs_col, lhs_idx, rhs_col, rhs_idx, on, how): @pytest.mark.parametrize( - "on", ["A", "L0"], + "on", + ["A", "L0"], ) @pytest.mark.parametrize( "how", ["left", "inner", "right", "outer", "leftanti", "leftsemi"] diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 8cc65de739e..6a665a2b43c 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -301,12 +301,32 @@ 
def test_get_nulls(): @pytest.mark.parametrize( "data, scalar, expect", [ - ([[1, 2, 3], []], 1, [True, False],), - ([[1, 2, 3], [], [3, 4, 5]], 6, [False, False, False],), - ([[1.0, 2.0, 3.0], None, []], 2.0, [True, None, False],), - ([[None, "b", "c"], [], ["b", "e", "f"]], "b", [True, False, True],), + ( + [[1, 2, 3], []], + 1, + [True, False], + ), + ( + [[1, 2, 3], [], [3, 4, 5]], + 6, + [False, False, False], + ), + ( + [[1.0, 2.0, 3.0], None, []], + 2.0, + [True, None, False], + ), + ( + [[None, "b", "c"], [], ["b", "e", "f"]], + "b", + [True, False, True], + ), ([[None, 2, 3], None, []], 1, [False, None, False]), - ([[None, "b", "c"], [], ["b", "e", "f"]], "d", [False, False, False],), + ( + [[None, "b", "c"], [], ["b", "e", "f"]], + "d", + [False, False, False], + ), ], ) def test_contains_scalar(data, scalar, expect): @@ -319,11 +339,26 @@ def test_contains_scalar(data, scalar, expect): @pytest.mark.parametrize( "data, expect", [ - ([[1, 2, 3], []], [None, None],), - ([[1.0, 2.0, 3.0], None, []], [None, None, None],), - ([[None, 2, 3], [], None], [None, None, None],), - ([[1, 2, 3], [3, 4, 5]], [None, None],), - ([[], [], []], [None, None, None],), + ( + [[1, 2, 3], []], + [None, None], + ), + ( + [[1.0, 2.0, 3.0], None, []], + [None, None, None], + ), + ( + [[None, 2, 3], [], None], + [None, None, None], + ), + ( + [[1, 2, 3], [3, 4, 5]], + [None, None], + ), + ( + [[], [], []], + [None, None, None], + ), ], ) def test_contains_null_search_key(data, expect): diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 7643bfdf050..4eb9ed44a98 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
""" Tests related to is_unique and is_monotonic attributes @@ -261,7 +261,8 @@ def test_rangeindex_get_slice_bound_basic(bounds, indices, side, kind): [(3, 20, 5), (20, 3, -5), (20, 3, 5), (3, 20, -5), (0, 0, 2), (3, 3, 2)], ) @pytest.mark.parametrize( - "label", [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], + "label", + [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], ) @pytest.mark.parametrize("side", ["left", "right"]) @pytest.mark.parametrize("kind", ["getitem", "loc"]) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 21b179caa38..160db7053b9 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -21,7 +21,7 @@ def test_can_cast_safely_same_kind(): assert data.can_cast_safely(to_dtype) - data = cudf.Series([1, 2, 2 ** 31], dtype="int64")._column + data = cudf.Series([1, 2, 2**31], dtype="int64")._column assert not data.can_cast_safely(to_dtype) # 'u' -> 'u' @@ -35,7 +35,7 @@ def test_can_cast_safely_same_kind(): assert data.can_cast_safely(to_dtype) - data = cudf.Series([1, 2, 2 ** 33], dtype="uint64")._column + data = cudf.Series([1, 2, 2**33], dtype="uint64")._column assert not data.can_cast_safely(to_dtype) # 'f' -> 'f' @@ -56,7 +56,7 @@ def test_can_cast_safely_mixed_kind(): assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly - data = cudf.Series([1, 2, 2 ** 24 + 1], dtype="int32")._column + data = cudf.Series([1, 2, 2**24 + 1], dtype="int32")._column assert not data.can_cast_safely(to_dtype) data = cudf.Series([1, 2, 3], dtype="uint32")._column @@ -64,7 +64,7 @@ def test_can_cast_safely_mixed_kind(): assert data.can_cast_safely(to_dtype) # too big to fit into f32 exactly - data = cudf.Series([1, 2, 2 ** 24 + 1], dtype="uint32")._column + data = cudf.Series([1, 2, 2**24 + 1], dtype="uint32")._column assert not data.can_cast_safely(to_dtype) to_dtype = np.dtype("float64") @@ -82,7 +82,7 @@ def 
test_can_cast_safely_mixed_kind(): assert data.can_cast_safely(to_dtype) # float out of int range - data = cudf.Series([1.0, 2.0, 1.0 * (2 ** 31)], dtype="float32")._column + data = cudf.Series([1.0, 2.0, 1.0 * (2**31)], dtype="float32")._column assert not data.can_cast_safely(to_dtype) # negative signed integers casting to unsigned integers @@ -174,9 +174,9 @@ def test_to_numeric_basic_1d(data): @pytest.mark.parametrize( "data", [ - [1, 2 ** 11], - [1, 2 ** 33], - [1, 2 ** 63], + [1, 2**11], + [1, 2**33], + [1, 2**63], [np.iinfo(np.int64).max, np.iinfo(np.int64).min], ], ) @@ -196,12 +196,12 @@ def test_to_numeric_downcast_int(data, downcast): @pytest.mark.parametrize( "data", [ - [1.0, 2.0 ** 11], - [-1.0, -(2.0 ** 11)], - [1.0, 2.0 ** 33], - [-1.0, -(2.0 ** 33)], - [1.0, 2.0 ** 65], - [-1.0, -(2.0 ** 65)], + [1.0, 2.0**11], + [-1.0, -(2.0**11)], + [1.0, 2.0**33], + [-1.0, -(2.0**33)], + [1.0, 2.0**65], + [-1.0, -(2.0**65)], [1.0, float("inf")], [1.0, float("-inf")], [1.0, float("nan")], @@ -225,11 +225,11 @@ def test_to_numeric_downcast_float(data, downcast): @pytest.mark.parametrize( "data", [ - [1.0, 2.0 ** 129], - [1.0, 2.0 ** 257], + [1.0, 2.0**129], + [1.0, 2.0**257], [1.0, 1.79e308], - [-1.0, -(2.0 ** 129)], - [-1.0, -(2.0 ** 257)], + [-1.0, -(2.0**129)], + [-1.0, -(2.0**257)], [-1.0, -1.79e308], ], ) @@ -247,11 +247,11 @@ def test_to_numeric_downcast_large_float(data, downcast): @pytest.mark.parametrize( "data", [ - [1.0, 2.0 ** 129], - [1.0, 2.0 ** 257], + [1.0, 2.0**129], + [1.0, 2.0**257], [1.0, 1.79e308], - [-1.0, -(2.0 ** 129)], - [-1.0, -(2.0 ** 257)], + [-1.0, -(2.0**129)], + [-1.0, -(2.0**257)], [-1.0, -1.79e308], ], ) @@ -400,7 +400,8 @@ def test_series_construction_with_nulls(dtype, input_obj): @pytest.mark.parametrize( - "data", [[True, False, True]], + "data", + [[True, False, True]], ) @pytest.mark.parametrize( "downcast", ["signed", "integer", "unsigned", "float"] diff --git a/python/cudf/cudf/tests/test_onehot.py 
b/python/cudf/cudf/tests/test_onehot.py index 2b0422ffecb..41af6a64155 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from string import ascii_lowercase diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index bd7335c577c..62715ad7580 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -674,7 +674,10 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): # Read back written ORC's statistics orc_file = pa.orc.ORCFile(fname) - (file_stats, stripes_stats,) = cudf.io.orc.read_orc_statistics([fname]) + ( + file_stats, + stripes_stats, + ) = cudf.io.orc.read_orc_statistics([fname]) # check file stats for col in gdf: @@ -726,7 +729,10 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): # Read back written ORC's statistics orc_file = pa.orc.ORCFile(fname) - (file_stats, stripes_stats,) = cudf.io.orc.read_orc_statistics([fname]) + ( + file_stats, + stripes_stats, + ) = cudf.io.orc.read_orc_statistics([fname]) # check file stats col = "col_bool" @@ -1070,7 +1076,10 @@ def test_skip_rows_for_nested_types(columns, list_struct_buff): RuntimeError, match="skip_rows is not supported by nested column" ): cudf.read_orc( - list_struct_buff, columns=columns, use_index=True, skiprows=5, + list_struct_buff, + columns=columns, + use_index=True, + skiprows=5, ) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 91b4009995b..58ba77d0b0e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1759,7 +1759,8 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): @pytest.mark.parametrize( - "pfilters", [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], + "pfilters", + [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], ) 
@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) @pytest.mark.parametrize("use_cat", [True, False]) @@ -1821,12 +1822,20 @@ def test_read_parquet_partitioned_filtered( # backend will filter by row (and cudf can # only filter by column, for now) filters = [("a", "==", 10)] - got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + got = cudf.read_parquet( + read_path, + filters=filters, + row_groups=row_groups, + ) assert len(got) < len(df) and 10 in got["a"] # Filter on both kinds of columns filters = [[("a", "==", 10)], [("c", "==", 1)]] - got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + got = cudf.read_parquet( + read_path, + filters=filters, + row_groups=row_groups, + ) assert len(got) < len(df) and (1 in got["c"] and 10 in got["a"]) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 09129a43f07..46b48b8244c 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import datetime @@ -58,7 +58,7 @@ def test_query(data, fn, nulls): params_query_env_fn = [ (lambda a, b, c, d: a * c > b + d, "a * @c > b + @d"), ( - lambda a, b, c, d: ((a / c) < d) | ((b ** c) > d), + lambda a, b, c, d: ((a / c) < d) | ((b**c) > d), "((a / @c) < @d) | ((b ** @c) > @d)", ), ] diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index e1ca006e0ac..15a7eab738a 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from itertools import chain, combinations_with_replacement, product @@ -134,7 +134,8 @@ def test_rank_error_arguments(pdf): "elem,dtype", list( product( - combinations_with_replacement(sort_group_args, 4), sort_dtype_args, + combinations_with_replacement(sort_group_args, 4), + sort_dtype_args, ) ), ) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 7106ab54686..a24fef93f89 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -125,7 +125,7 @@ def test_sum_of_squares(dtype, nelem): got = sr.sum_of_squares() got_df = df.sum_of_squares() - expect = (data ** 2).sum() + expect = (data**2).sum() if cudf.dtype(dtype).kind in {"u", "i"}: if 0 <= expect <= np.iinfo(dtype).max: @@ -261,7 +261,7 @@ def test_sum_boolean(): def test_date_minmax(): - np_data = np.random.normal(size=10 ** 3) + np_data = np.random.normal(size=10**3) gdf_data = Series(np_data) np_casted = np_data.astype("datetime64[ms]") @@ -277,7 +277,8 @@ def test_date_minmax(): @pytest.mark.parametrize( - "op", ["sum", "product", "var", "kurt", "kurtosis", "skew"], + "op", + ["sum", "product", "var", "kurt", "kurtosis", "skew"], ) def test_datetime_unsupported_reductions(op): gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]") diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 90429945cc5..14e81d6ad30 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1057,7 +1057,14 @@ def test_replace_df_error(): @pytest.mark.parametrize( ("lower", "upper"), - [([2, 7.4], [4, 7.9]), ([2, 7.4], None), (None, [4, 7.9],)], + [ + ([2, 7.4], [4, 7.9]), + ([2, 7.4], None), + ( + None, + [4, 7.9], + ), + ], ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_clip(lower, upper, inplace): @@ -1076,7 +1083,8 @@ def test_dataframe_clip(lower, upper, inplace): @pytest.mark.parametrize( - ("lower", "upper"), 
[("b", "d"), ("b", None), (None, "c"), (None, None)], + ("lower", "upper"), + [("b", "d"), ("b", None), (None, "c"), (None, None)], ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_category_clip(lower, upper, inplace): @@ -1173,7 +1181,15 @@ def test_index_clip(data, lower, upper, inplace): @pytest.mark.parametrize( - ("lower", "upper"), [([2, 3], [4, 5]), ([2, 3], None), (None, [4, 5],)], + ("lower", "upper"), + [ + ([2, 3], [4, 5]), + ([2, 3], None), + ( + None, + [4, 5], + ), + ], ) @pytest.mark.parametrize("inplace", [True, False]) def test_multiindex_clip(lower, upper, inplace): @@ -1257,7 +1273,10 @@ def test_series_replace_errors(): gsr.replace([1, 2], ["a", "b"]) assert_exceptions_equal( - psr.replace, gsr.replace, ([{"a": 1}, 1],), ([{"a": 1}, 1],), + psr.replace, + gsr.replace, + ([{"a": 1}, 1],), + ([{"a": 1}, 1],), ) assert_exceptions_equal( diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index 3b8e807c3b6..f0101803995 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest @@ -25,7 +27,8 @@ def test_series_downsample_simple(ts_resolution): gsr = cudf.from_pandas(psr) gsr.index = gsr.index.astype(f"datetime64[{ts_resolution}]") assert_resample_results_equal( - psr.resample("3T").sum(), gsr.resample("3T").sum(), + psr.resample("3T").sum(), + gsr.resample("3T").sum(), ) @@ -36,7 +39,8 @@ def test_series_upsample_simple(): psr = pd.Series(range(10), index=index) gsr = cudf.from_pandas(psr) assert_resample_results_equal( - psr.resample("3T").sum(), gsr.resample("3T").sum(), + psr.resample("3T").sum(), + gsr.resample("3T").sum(), ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 2efa781c506..14fa4be7fed 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -460,7 +460,9 @@ def test_unstack_multiindex(level): ).set_index(["foo", "bar", "baz"]) gdf = cudf.from_pandas(pdf) assert_eq( - pdf.unstack(level=level), gdf.unstack(level=level), check_dtype=False, + pdf.unstack(level=level), + gdf.unstack(level=level), + check_dtype=False, ) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index abf38f74b86..87d1faf33ca 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -365,7 +365,7 @@ def test_rolling_dataframe_numba_udf_basic(data, center): def some_func(A): b = 0 for a in A: - b = b + a ** 2 + b = b + a**2 return b / len(A) for window_size in range(1, len(data) + 1): diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 4807879a730..d783483a8cb 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import os import shlex @@ -282,7 +282,12 @@ def test_read_parquet( @pytest.mark.parametrize("columns", [None, ["List", "Struct"]]) @pytest.mark.parametrize("index", [None, "Integer"]) def test_read_parquet_ext( - s3_base, s3so, pdf_ext, bytes_per_thread, columns, index, + s3_base, + s3so, + pdf_ext, + bytes_per_thread, + columns, + index, ): fname = "test_parquet_reader_ext.parquet" bname = "parquet" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 385f7f41f72..b5be0b208a0 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -560,7 +560,7 @@ def test_categorical_value_counts(dropna, normalize, num_elements): @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) def test_series_value_counts(dropna, normalize): - for size in [10 ** x for x in range(5)]: + for size in [10**x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array( @@ -867,8 +867,14 @@ def test_series_memory_usage(): ), ), ( - cudf.Series([1, 2, None, 10.2, None], dtype="float32",), - pd.Series([1, 2, None, 10.2, None], dtype=pd.Float32Dtype(),), + cudf.Series( + [1, 2, None, 10.2, None], + dtype="float32", + ), + pd.Series( + [1, 2, None, 10.2, None], + dtype=pd.Float32Dtype(), + ), ), ], ) @@ -1077,9 +1083,18 @@ def test_series_drop_index(ps, index, inplace): ("speed", 1), ("weight", 1), ("length", 1), - ("cow", None,), - ("lama", None,), - ("falcon", None,), + ( + "cow", + None, + ), + ( + "lama", + None, + ), + ( + "falcon", + None, + ), ], ) @pytest.mark.parametrize("inplace", [True, False]) @@ -1158,7 +1173,8 @@ def test_series_drop_raises(): @pytest.mark.parametrize( - "data", [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], + "data", + [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], ) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize( @@ -1431,8 
+1447,14 @@ def test_reset_index_dup_level_name_exceptions(): assert_exceptions_equal( lfunc=ps.reset_index, rfunc=gs.reset_index, - lfunc_args_and_kwargs=([], {"level": [None]},), - rfunc_args_and_kwargs=([], {"level": [None]},), + lfunc_args_and_kwargs=( + [], + {"level": [None]}, + ), + rfunc_args_and_kwargs=( + [], + {"level": [None]}, + ), expected_error_message="occurs multiple times, use a level number", ) @@ -1440,8 +1462,14 @@ def test_reset_index_dup_level_name_exceptions(): assert_exceptions_equal( lfunc=ps.reset_index, rfunc=gs.reset_index, - lfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), - rfunc_args_and_kwargs=([], {"drop": False, "inplace": True},), + lfunc_args_and_kwargs=( + [], + {"drop": False, "inplace": True}, + ), + rfunc_args_and_kwargs=( + [], + {"drop": False, "inplace": True}, + ), ) # Pandas raises the above exception should these two inputs crosses. @@ -1518,7 +1546,8 @@ def test_series_transpose(data): @pytest.mark.parametrize( - "data", [1, 3, 5, 7, 7], + "data", + [1, 3, 5, 7, 7], ) def test_series_nunique(data): cd_s = cudf.Series(data) @@ -1531,7 +1560,8 @@ def test_series_nunique(data): @pytest.mark.parametrize( - "data", [1, 3, 5, 7, 7], + "data", + [1, 3, 5, 7, 7], ) def test_series_nunique_index(data): cd_s = cudf.Series(data) diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index d4ef3ba235d..f1a51a45779 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from itertools import product from math import floor @@ -37,8 +37,8 @@ def test_series_map_callable_numeric_basic(): gd2 = cudf.Series([1, 2, 3, 4, np.nan]) pdf2 = gd2.to_pandas() - expected_function = pdf2.map(lambda x: x ** 2) - actual_function = gd2.map(lambda x: x ** 2) + expected_function = pdf2.map(lambda x: x**2) + actual_function = gd2.map(lambda x: x**2) assert_eq(expected_function, actual_function) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 98e3b255aaf..977a01952db 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -81,7 +81,7 @@ def test_series_std(ddof): def test_series_unique(): - for size in [10 ** x for x in range(5)]: + for size in [10**x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series(arr) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index f5bfcd8c9d2..d5d21f0b3c5 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -244,7 +244,8 @@ def test_string_empty_to_decimal(): gs = cudf.Series(["", "-85", ""], dtype="str") got = gs.astype(cudf.Decimal64Dtype(scale=0, precision=5)) expected = cudf.Series( - [0, -85, 0], dtype=cudf.Decimal64Dtype(scale=0, precision=5), + [0, -85, 0], + dtype=cudf.Decimal64Dtype(scale=0, precision=5), ) assert_eq(expected, got) @@ -272,7 +273,8 @@ def test_string_from_decimal(data, scale, precision, decimal_dtype): else: decimal_data.append(Decimal(d)) fp = cudf.Series( - decimal_data, dtype=decimal_dtype(scale=scale, precision=precision), + decimal_data, + dtype=decimal_dtype(scale=scale, precision=precision), ) gs = fp.astype("str") got = gs.astype(decimal_dtype(scale=scale, precision=precision)) @@ -532,7 +534,8 @@ def _cat_convert_seq_to_cudf(others): @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) @pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) 
@pytest.mark.parametrize( - "index", [["1", "2", "3", "4", "5"]], + "index", + [["1", "2", "3", "4", "5"]], ) def test_string_cat(ps_gs, others, sep, na_rep, index): ps, gs = ps_gs @@ -682,12 +685,15 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) assert_eq( - expect, got, exact=False, + expect, + got, + exact=False, ) @pytest.mark.parametrize( - "data", [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], + "data", + [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], ) @pytest.mark.parametrize( "others", @@ -869,7 +875,8 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): @pytest.mark.parametrize( - "data", [["hello", "world", None, "", "!"]], + "data", + [["hello", "world", None, "", "!"]], ) @pytest.mark.parametrize( "repeats", @@ -1207,7 +1214,8 @@ def test_string_get(string, index): gds = cudf.Series(string) assert_eq( - pds.str.get(index).fillna(""), gds.str.get(index).fillna(""), + pds.str.get(index).fillna(""), + gds.str.get(index).fillna(""), ) @@ -1220,10 +1228,12 @@ def test_string_get(string, index): ], ) @pytest.mark.parametrize( - "number", [-10, 0, 1, 3, 10], + "number", + [-10, 0, 1, 3, 10], ) @pytest.mark.parametrize( - "diff", [0, 2, 5, 9], + "diff", + [0, 2, 5, 9], ) def test_string_slice_str(string, number, diff): pds = pd.Series(string) @@ -1719,7 +1729,8 @@ def test_strings_zfill_tests(data, width): ) @pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) @pytest.mark.parametrize( - "side", ["left", "right", "both"], + "side", + ["left", "right", "both"], ) @pytest.mark.parametrize("fillchar", [" ", ".", "\n", "+", "\t"]) def test_strings_pad_tests(data, width, side, fillchar): @@ -1920,7 +1931,8 @@ def test_string_table_view_creation(): ], ) @pytest.mark.parametrize( - "pat", ["", None, " ", "a", "abc", "cat", "$", "\n"], + "pat", + ["", None, " ", "a", "abc", "cat", "$", "\n"], ) def test_string_starts_ends(data, pat): ps = 
pd.Series(data) @@ -1996,7 +2008,8 @@ def test_string_starts_ends_list_like_pat(data, pat): ], ) @pytest.mark.parametrize( - "sub", ["", " ", "a", "abc", "cat", "$", "\n"], + "sub", + ["", " ", "a", "abc", "cat", "$", "\n"], ) def test_string_find(data, sub): ps = pd.Series(data) @@ -2005,49 +2018,65 @@ def test_string_find(data, sub): got = gs.str.find(sub) expect = ps.str.find(sub) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.find(sub, start=1) expect = ps.str.find(sub, start=1) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.find(sub, end=10) expect = ps.str.find(sub, end=10) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.find(sub, start=2, end=10) expect = ps.str.find(sub, start=2, end=10) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.rfind(sub) expect = ps.str.rfind(sub) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.rfind(sub, start=1) expect = ps.str.rfind(sub, start=1) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.rfind(sub, end=10) expect = ps.str.rfind(sub, end=10) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) got = gs.str.rfind(sub, start=2, end=10) expect = ps.str.rfind(sub, start=2, end=10) assert_eq( - expect, got, check_dtype=False, + expect, + got, + check_dtype=False, ) @@ -2176,7 +2205,8 @@ def test_string_contains_multi(data, sub, expect): # Pandas does not allow 'case' or 'flags' if 'pat' is re.Pattern # This covers contains, match, count, and replace @pytest.mark.parametrize( - "pat", [re.compile("[n-z]"), re.compile("[A-Z]"), re.compile("de"), "A"], + "pat", + [re.compile("[n-z]"), re.compile("[A-Z]"), re.compile("de"), "A"], ) @pytest.mark.parametrize("repl", ["xyz", "", " "]) def 
test_string_compiled_re(ps_gs, pat, repl): diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 4dc4d86d94c..efb3ce96838 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -43,7 +43,12 @@ def arrow_arrays(request): "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] ) def test_basic_assert_index_equal( - rdata, exact, check_names, rname, check_categorical, dtype, + rdata, + exact, + check_names, + rname, + check_categorical, + dtype, ): p_left = pd.Index([1, 2, 3], name="a", dtype=dtype) p_right = pd.Index(rdata, name=rname, dtype=dtype) @@ -100,7 +105,12 @@ def test_basic_assert_index_equal( "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] ) def test_basic_assert_series_equal( - rdata, rname, check_names, check_category_order, check_categorical, dtype, + rdata, + rname, + check_names, + check_category_order, + check_categorical, + dtype, ): p_left = pd.Series([1, 2, 3], name="a", dtype=dtype) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 2623b755cfb..71c30e0aaa5 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -318,7 +318,8 @@ def test_timedelta_ops_misc_inputs(data, other, dtype, ops): @pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) @pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) @pytest.mark.parametrize( - "ops", ["add", "sub"], + "ops", + ["add", "sub"], ) def test_timedelta_ops_datetime_inputs( datetime_data, timedelta_data, datetime_dtype, timedelta_dtype, ops @@ -645,7 +646,8 @@ def test_timedelta_reduction_ops(data, dtype, reduction_op): @pytest.mark.parametrize( - "data", _TIMEDELTA_DATA, + "data", + _TIMEDELTA_DATA, ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_dt_components(data, dtype): @@ -662,7 +664,8 @@ def test_timedelta_dt_components(data, dtype): 
@pytest.mark.parametrize( - "data", _TIMEDELTA_DATA, + "data", + _TIMEDELTA_DATA, ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_dt_properties(data, dtype): @@ -697,7 +700,8 @@ def local_assert(expected, actual): @pytest.mark.parametrize( - "data", _TIMEDELTA_DATA, + "data", + _TIMEDELTA_DATA, ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_index(data, dtype): diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index bd7ee45fbf8..b5bcf9df8f5 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import numpy as np @@ -11,14 +11,14 @@ def _generic_function(a): - return a ** 3 + return a**3 @pytest.mark.parametrize("dtype", supported_types) @pytest.mark.parametrize( "udf,testfunc", [ - (_generic_function, lambda ser: ser ** 3), + (_generic_function, lambda ser: ser**3), (lambda x: x in [1, 2, 3, 4], lambda ser: np.isin(ser, [1, 2, 3, 4])), ], ) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 173515509cd..1ad45e721a3 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import numpy as np import pytest @@ -33,7 +33,7 @@ def test_generic_ptx(dtype): rhs_col = Series(rhs_arr)._column def generic_function(a, b): - return a ** 3 + b + return a**3 + b nb_type = numpy_support.from_dtype(cudf.dtype(dtype)) type_signature = (nb_type, nb_type) @@ -46,6 +46,6 @@ def generic_function(a, b): out_col = libcudf.binaryop.binaryop_udf(lhs_col, rhs_col, ptx_code, dtype) - result = lhs_arr ** 3 + rhs_arr + result = lhs_arr**3 + rhs_arr np.testing.assert_almost_equal(result, out_col.values_host) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index faaea6eec08..36750adf6ee 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -644,16 +644,16 @@ def test_masked_udf_caching(): # recompile data = cudf.Series([1, 2, 3]) - expect = data ** 2 - got = data.applymap(lambda x: x ** 2) + expect = data**2 + got = data.applymap(lambda x: x**2) assert_eq(expect, got, check_dtype=False) # update the constant value being used and make sure # it does not result in a cache hit - expect = data ** 3 - got = data.applymap(lambda x: x ** 3) + expect = data**3 + got = data.applymap(lambda x: x**3) assert_eq(expect, got, check_dtype=False) # make sure we get a hit when reapplying diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index 11029cbfe5e..cecf0c36bc2 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
# This function is from the rapidsai/clx repo at below link # https://github.com/rapidsai/clx/blob/267c6d30805c9dcbf80840f222bf31c5c4b7068a/python/clx/analytics/_perfect_hash.py import numpy as np @@ -10,16 +10,16 @@ A_SECOND_LEVEL_POW = np.uint8(48) B_SECOND_LEVEL_POW = np.uint8(7) -A_LBOUND_SECOND_LEVEL_HASH = 2 ** 16 -A_HBOUND_SECOND_LEVEL_HASH = 2 ** A_SECOND_LEVEL_POW +A_LBOUND_SECOND_LEVEL_HASH = 2**16 +A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW B_LBOUND_SECOND_LEVEL_HASH = 0 -B_HBOUND_SECOND_LEVEL_HASH = 2 ** B_SECOND_LEVEL_POW +B_HBOUND_SECOND_LEVEL_HASH = 2**B_SECOND_LEVEL_POW # Extremely generous and should not ever happen. This limit is imposed # To ensure we can bit pack all the information needed for the bin hash # functions - a, b and table size -MAX_SIZE_FOR_INITIAL_BIN = 2 ** 8 - 1 +MAX_SIZE_FOR_INITIAL_BIN = 2**8 - 1 # Shifts for bit packing @@ -71,8 +71,8 @@ def _get_space_util(bins, init_bins): def _pick_initial_a_b(data, max_constant, init_bins): while True: - a = np.random.randint(2 ** 12, 2 ** 15) - b = np.random.randint(2 ** 12, 2 ** 15) + a = np.random.randint(2**12, 2**15) + b = np.random.randint(2**12, 2**15) bins = _make_bins(data, init_bins, a, b) score = _get_space_util(bins, init_bins) / len(data) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 8f8a40ae4ab..cfe1957dfd6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
import datetime import os @@ -1396,13 +1396,18 @@ def get_filepath_or_buffer( else: if use_python_file_object: path_or_data = _open_remote_files( - paths, fs, **(open_file_options or {}), + paths, + fs, + **(open_file_options or {}), ) else: path_or_data = [ BytesIO( _fsspec_data_transfer( - fpath, fs=fs, mode=mode, **kwargs, + fpath, + fs=fs, + mode=mode, + **kwargs, ) ) for fpath in paths @@ -1685,7 +1690,11 @@ def _fsspec_data_transfer( for b in range(0, file_size, bytes_per_thread) ] _read_byte_ranges( - path_or_fob, byte_ranges, buf, fs=fs, **kwargs, + path_or_fob, + byte_ranges, + buf, + fs=fs, + **kwargs, ) return buf.tobytes() @@ -1717,19 +1726,25 @@ def _assign_block(fs, path_or_fob, local_buffer, offset, nbytes): # We have an open fsspec file object path_or_fob.seek(offset) local_buffer[offset : offset + nbytes] = np.frombuffer( - path_or_fob.read(nbytes), dtype="b", + path_or_fob.read(nbytes), + dtype="b", ) else: # We have an fsspec filesystem and a path with fs.open(path_or_fob, mode="rb", cache_type="none") as fob: fob.seek(offset) local_buffer[offset : offset + nbytes] = np.frombuffer( - fob.read(nbytes), dtype="b", + fob.read(nbytes), + dtype="b", ) def _read_byte_ranges( - path_or_fob, ranges, local_buffer, fs=None, **kwargs, + path_or_fob, + ranges, + local_buffer, + fs=None, + **kwargs, ): # Simple utility to copy remote byte ranges # into a local buffer for IO in libcudf diff --git a/python/cudf/setup.py b/python/cudf/setup.py index e4e43bc1595..9d7b3a36235 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import os import re @@ -253,7 +253,8 @@ def run(self): ext_modules=extensions, packages=find_packages(include=["cudf", "cudf.*"]), package_data=dict.fromkeys( - find_packages(include=["cudf._lib*"]), ["*.pxd"], + find_packages(include=["cudf._lib*"]), + ["*.pxd"], ), cmdclass=cmdclass, install_requires=install_requires, diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index 824babfa10a..4aff8ca7990 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import os import shutil import sysconfig @@ -104,7 +104,8 @@ ), packages=find_packages(include=["cudf_kafka", "cudf_kafka.*"]), package_data=dict.fromkeys( - find_packages(include=["cudf_kafka._lib*"]), ["*.pxd"], + find_packages(include=["cudf_kafka._lib*"]), + ["*.pxd"], ), cmdclass=versioneer.get_cmdclass(), install_requires=install_requires, diff --git a/python/cudf_kafka/versioneer.py b/python/cudf_kafka/versioneer.py index c7dbfd76734..a3b0246e785 100644 --- a/python/cudf_kafka/versioneer.py +++ b/python/cudf_kafka/versioneer.py @@ -1123,7 +1123,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( - full_tag, tag_prefix, + full_tag, + tag_prefix, ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index dbb1109b7d3..d1edfb071a2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -342,7 +342,6 @@ def percentile_cudf(a, q, interpolation="linear"): n, ) - except ImportError: pass diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 4d193f34b9f..5a21068feac 100644 --- a/python/dask_cudf/dask_cudf/core.py 
+++ b/python/dask_cudf/dask_cudf/core.py @@ -456,7 +456,7 @@ class Index(Series, dd.core.Index): def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num ** 2).sum(skipna=skipna, split_every=split_every) + x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) n = num.count(split_every=split_every) name = ddf._token_prefix + "var" result = map_partitions( @@ -489,7 +489,7 @@ def _aggregate_var(parts): n = n_a + n_b avg = (n_a * avg_a + n_b * avg_b) / n delta = avg_b - avg_a - m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n + m2 = m2_a + m2_b + delta**2 * n_a * n_b / n return n, avg, m2 def _finalize_var(vals): diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 76533706030..684b1f71099 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -46,11 +46,19 @@ def __init__(self, *args, **kwargs): def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( - self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna, + self.obj, + by=self.by, + slice=key, + sort=self.sort, + **self.dropna, ) else: g = CudfSeriesGroupBy( - self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna, + self.obj, + by=self.by, + slice=key, + sort=self.sort, + **self.dropna, ) g._meta = g._meta[key] @@ -540,7 +548,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): x2 = df[pow2_sum_name] # Use sum-squared approach to get variance - var = x2 - x ** 2 / n + var = x2 - x**2 / n div = n - ddof div[div < 1] = 1 # Avoid division by 0 var /= div diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ac5795fa2ec..042759f68cf 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2022, NVIDIA CORPORATION. import warnings from contextlib import ExitStack from functools import partial @@ -130,7 +130,8 @@ def _read_paths( # Build the column from `codes` directly # (since the category is often a larger dtype) codes = as_column( - partitions[i].keys.index(index2), length=len(df), + partitions[i].keys.index(index2), + length=len(df), ) df[name] = build_categorical_column( categories=partitions[i].keys, diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index c5d3cf293fd..d9b8ee4595a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -408,7 +408,10 @@ def test_row_groups_per_part(tmpdir, row_groups, index): write_metadata_file=True, ) - ddf2 = dask_cudf.read_parquet(str(tmpdir), row_groups_per_part=row_groups,) + ddf2 = dask_cudf.read_parquet( + str(tmpdir), + row_groups_per_part=row_groups, + ) dd.assert_eq(ddf1, ddf2, check_divisions=False) @@ -426,7 +429,9 @@ def test_create_metadata_file(tmpdir, partition_on): df1.index.name = "myindex" ddf1 = dask_cudf.from_cudf(df1, npartitions=10) ddf1.to_parquet( - tmpdir, write_metadata_file=False, partition_on=partition_on, + tmpdir, + write_metadata_file=False, + partition_on=partition_on, ) # Add global _metadata file @@ -435,7 +440,8 @@ def test_create_metadata_file(tmpdir, partition_on): else: fns = glob.glob(os.path.join(tmpdir, "*.parquet")) dask_cudf.io.parquet.create_metadata_file( - fns, split_every=3, # Force tree reduction + fns, + split_every=3, # Force tree reduction ) # Check that we can now read the ddf diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index db4b655fcbd..84c0e0e9b39 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -351,7 +351,8 @@ def test_create_list_series(data): 
@pytest.mark.parametrize( - "data", [data_test_1(), data_test_2(), data_test_non_numeric()], + "data", + [data_test_1(), data_test_2(), data_test_non_numeric()], ) def test_unique(data): expect = Series(data).list.unique() @@ -360,7 +361,8 @@ def test_unique(data): @pytest.mark.parametrize( - "data", [data_test_2(), data_test_non_numeric()], + "data", + [data_test_2(), data_test_non_numeric()], ) def test_len(data): expect = Series(data).list.len() @@ -369,7 +371,8 @@ def test_len(data): @pytest.mark.parametrize( - "data, search_key", [(data_test_2(), 1)], + "data, search_key", + [(data_test_2(), 1)], ) def test_contains(data, search_key): expect = Series(data).list.contains(search_key) @@ -394,7 +397,8 @@ def test_get(data, index, expectation): @pytest.mark.parametrize( - "data", [data_test_1(), data_test_2(), data_test_nested()], + "data", + [data_test_1(), data_test_2(), data_test_nested()], ) def test_leaves(data): expect = Series(data).list.leaves @@ -459,7 +463,8 @@ def test_sorting(data, ascending, na_position, ignore_index): @pytest.mark.parametrize( - "data", struct_accessor_data_params, + "data", + struct_accessor_data_params, ) def test_create_struct_series(data): expect = pd.Series(data) @@ -468,7 +473,8 @@ def test_create_struct_series(data): @pytest.mark.parametrize( - "data", struct_accessor_data_params, + "data", + struct_accessor_data_params, ) def test_struct_field_str(data): for test_key in ["a", "b"]: @@ -478,7 +484,8 @@ def test_struct_field_str(data): @pytest.mark.parametrize( - "data", struct_accessor_data_params, + "data", + struct_accessor_data_params, ) def test_struct_field_integer(data): for test_key in [0, 1]: @@ -488,7 +495,8 @@ def test_struct_field_integer(data): @pytest.mark.parametrize( - "data", struct_accessor_data_params, + "data", + struct_accessor_data_params, ) def test_dask_struct_field_Key_Error(data): got = dgd.from_cudf(Series(data), 2) @@ -498,7 +506,8 @@ def test_dask_struct_field_Key_Error(data): 
@pytest.mark.parametrize( - "data", struct_accessor_data_params, + "data", + struct_accessor_data_params, ) def test_dask_struct_field_Int_Error(data): # breakpoint() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 67fed62c582..89326b60f37 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import random @@ -720,7 +720,9 @@ def test_series_describe(): pdsr = dd.from_pandas(psr, npartitions=4) dd.assert_eq( - dsr.describe(), pdsr.describe(), check_less_precise=3, + dsr.describe(), + pdsr.describe(), + check_less_precise=3, ) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 274c6670426..e3545149c24 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -316,7 +316,8 @@ def test_groupby_multiindex_reset_index(npartitions): gr_out[("b", "count")] = gr_out[("b", "count")].astype("int64") dd.assert_eq( - gr_out, pr.compute().sort_values(by=["a", "c"]).reset_index(drop=True), + gr_out, + pr.compute().sort_values(by=["a", "c"]).reset_index(drop=True), ) @@ -464,7 +465,8 @@ def test_groupby_categorical_key(): @pytest.mark.parametrize("npartitions", [1, 10]) def test_groupby_agg_params(npartitions, split_every, split_out, as_index): df = cudf.datasets.randomdata( - nrows=150, dtypes={"name": str, "a": int, "b": int, "c": float}, + nrows=150, + dtypes={"name": str, "a": int, "b": int, "c": float}, ) df["a"] = [0, 1, 2] * 50 ddf = dask_cudf.from_cudf(df, npartitions) @@ -480,7 +482,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): if split_out == 1: gf = ( ddf.groupby(["name", "a"], sort=True, as_index=as_index) - .aggregate(agg_dict, split_every=split_every, split_out=split_out,) + .aggregate( + agg_dict, + split_every=split_every, + split_out=split_out, + ) .compute() ) if as_index: @@ -499,10 +505,14 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, as_index=as_index).aggregate( - agg_dict, split_every=split_every, split_out=split_out, + agg_dict, + split_every=split_every, + split_out=split_out, ) pr = pddf.groupby(["name", "a"], sort=False).agg( - agg_dict, split_every=split_every, split_out=split_out, + agg_dict, + split_every=split_every, + split_out=split_out, ) # Test `as_index` argument @@ -573,7 +583,8 @@ def test_groupby_unique_lists(): gddf.groupby("a").b.unique().compute(), ) dd.assert_eq( - gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(), + gdf.groupby("a").b.unique(), + gddf.groupby("a").b.unique().compute(), ) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py 
b/python/dask_cudf/dask_cudf/tests/test_onehot.py index a9d88b5203c..6453d843467 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import pandas as pd import pytest @@ -118,5 +120,6 @@ def test_get_dummies_categorical(): got = dd.get_dummies(gddf, columns=["B"]) dd.assert_eq( - expect, got, + expect, + got, )