From f87e47f96b8ccdd8ebdf90b62344c3318d5642f8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 14 Mar 2024 15:03:29 -0700
Subject: [PATCH 1/9] Replace black with ruff-format

---
 .pre-commit-config.yaml | 9 +-
 python/cudf/benchmarks/API/bench_dataframe.py | 21 +-
 python/cudf/benchmarks/API/bench_functions.py | 12 +-
 .../cudf/benchmarks/API/bench_multiindex.py | 6 +-
 python/cudf/benchmarks/API/cases_functions.py | 20 +-
 python/cudf/benchmarks/common/utils.py | 6 +-
 python/cudf/benchmarks/conftest.py | 17 +-
 .../cudf/benchmarks/internal/bench_column.py | 8 +-
 python/cudf/cudf/_fuzz_testing/avro.py | 9 +-
 python/cudf/cudf/_fuzz_testing/csv.py | 38 +-
 python/cudf/cudf/_fuzz_testing/fuzzer.py | 4 +-
 python/cudf/cudf/_fuzz_testing/io.py | 10 +-
 python/cudf/cudf/_fuzz_testing/json.py | 14 +-
 python/cudf/cudf/_fuzz_testing/main.py | 6 +-
 python/cudf/cudf/_fuzz_testing/orc.py | 20 +-
 python/cudf/cudf/_fuzz_testing/parquet.py | 16 +-
 .../_fuzz_testing/tests/fuzz_test_json.py | 14 +-
 .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 10 +-
 .../_fuzz_testing/tests/fuzz_test_parquet.py | 6 +-
 python/cudf/cudf/_fuzz_testing/utils.py | 16 +-
 python/cudf/cudf/_typing.py | 8 +-
 python/cudf/cudf/_version.py | 6 +-
 python/cudf/cudf/api/extensions/accessor.py | 4 +-
 python/cudf/cudf/api/types.py | 31 +-
 python/cudf/cudf/core/_base_index.py | 72 +--
 .../cudf/cudf/core/_internals/expressions.py | 5 +-
 python/cudf/cudf/core/_internals/timezones.py | 32 +-
 python/cudf/cudf/core/_internals/where.py | 10 +-
 python/cudf/cudf/core/abc.py | 8 +-
 python/cudf/cudf/core/algorithms.py | 4 +-
 python/cudf/cudf/core/buffer/buffer.py | 12 +-
 python/cudf/cudf/core/buffer/spill_manager.py | 16 +-
 .../cudf/cudf/core/buffer/spillable_buffer.py | 19 +-
 python/cudf/cudf/core/buffer/utils.py | 7 +-
 python/cudf/cudf/core/column/categorical.py | 176 ++----
 python/cudf/cudf/core/column/column.py | 219 ++-----
 python/cudf/cudf/core/column/datetime.py | 79 +--
 python/cudf/cudf/core/column/decimal.py | 36 +-
 python/cudf/cudf/core/column/interval.py | 12 +-
 python/cudf/cudf/core/column/lists.py | 53 +-
 python/cudf/cudf/core/column/methods.py | 8 +-
 python/cudf/cudf/core/column/numerical.py | 82 +--
 .../cudf/cudf/core/column/numerical_base.py | 34 +-
 python/cudf/cudf/core/column/string.py | 373 +++--------
 python/cudf/cudf/core/column/struct.py | 24 +-
 python/cudf/cudf/core/column/timedelta.py | 104 ++-
 python/cudf/cudf/core/column_accessor.py | 43 +-
 python/cudf/cudf/core/common.py | 6 +-
 python/cudf/cudf/core/copy_types.py | 10 +-
 python/cudf/cudf/core/cut.py | 18 +-
 python/cudf/cudf/core/dataframe.py | 593 +++++-------------
 python/cudf/cudf/core/df_protocol.py | 91 +--
 python/cudf/cudf/core/dtypes.py | 51 +-
 python/cudf/cudf/core/frame.py | 106 +---
 python/cudf/cudf/core/groupby/groupby.py | 183 ++----
 python/cudf/cudf/core/index.py | 156 ++---
 python/cudf/cudf/core/indexed_frame.py | 297 +++------
 python/cudf/cudf/core/indexing_utils.py | 20 +-
 python/cudf/cudf/core/join/_join_helpers.py | 22 +-
 python/cudf/cudf/core/join/join.py | 47 +-
 python/cudf/cudf/core/mixins/mixin_factory.py | 16 +-
 python/cudf/cudf/core/multiindex.py | 152 ++---
 python/cudf/cudf/core/resample.py | 16 +-
 python/cudf/cudf/core/reshape.py | 45 +-
 python/cudf/cudf/core/scalar.py | 37 +-
 python/cudf/cudf/core/series.py | 203 ++----
 python/cudf/cudf/core/single_column_frame.py | 24 +-
 python/cudf/cudf/core/subword_tokenizer.py | 22 +-
 python/cudf/cudf/core/tools/datetimes.py | 42 +-
 python/cudf/cudf/core/tools/numeric.py | 4 +-
 python/cudf/cudf/core/udf/groupby_lowering.py | 46 +-
 python/cudf/cudf/core/udf/groupby_typing.py | 34 +-
 python/cudf/cudf/core/udf/groupby_utils.py | 12 +-
 python/cudf/cudf/core/udf/masked_lowering.py | 38 +-
 python/cudf/cudf/core/udf/masked_typing.py | 25 +-
 python/cudf/cudf/core/udf/row_function.py | 10 +-
 python/cudf/cudf/core/udf/strings_lowering.py | 38 +-
 python/cudf/cudf/core/udf/strings_typing.py | 8 +-
 python/cudf/cudf/core/udf/utils.py | 32 +-
 python/cudf/cudf/core/window/rolling.py | 30 +-
 python/cudf/cudf/datasets.py | 8 +-
 python/cudf/cudf/io/avro.py | 6 +-
 python/cudf/cudf/io/csv.py | 16 +-
 python/cudf/cudf/io/dlpack.py | 7 +-
 python/cudf/cudf/io/json.py | 18 +-
 python/cudf/cudf/io/orc.py | 47 +-
 python/cudf/cudf/io/parquet.py | 111 +---
 python/cudf/cudf/options.py | 16 +-
 python/cudf/cudf/pandas/_wrappers/pandas.py | 16 +-
 python/cudf/cudf/pandas/fast_slow_proxy.py | 30 +-
 python/cudf/cudf/pandas/module_accelerator.py | 38 +-
 python/cudf/cudf/pandas/profiler.py | 34 +-
 .../pandas/scripts/analyze-test-failures.py | 6 +-
 .../pandas/scripts/summarize-test-results.py | 14 +-
 python/cudf/cudf/testing/_utils.py | 16 +-
 python/cudf/cudf/testing/dataset_generator.py | 38 +-
 python/cudf/cudf/testing/testing.py | 62 +-
 .../tests/indexes/datetime/test_indexing.py | 10 +-
 .../indexes/datetime/test_time_specific.py | 4 +-
 .../cudf/cudf/tests/indexes/test_interval.py | 60 +-
 .../cudf/cudf/tests/input_output/test_text.py | 30 +-
 .../cudf/tests/series/test_datetimelike.py | 34 +-
 python/cudf/cudf/tests/test_array_function.py | 8 +-
 python/cudf/cudf/tests/test_array_ufunc.py | 20 +-
 .../test_avro_reader_fastavro_integration.py | 32 +-
 python/cudf/cudf/tests/test_binops.py | 212 ++-----
 python/cudf/cudf/tests/test_categorical.py | 38 +-
 python/cudf/cudf/tests/test_column.py | 23 +-
 .../cudf/cudf/tests/test_column_accessor.py | 8 +-
 python/cudf/cudf/tests/test_concat.py | 175 ++----
 python/cudf/cudf/tests/test_contains.py | 6 +-
 python/cudf/cudf/tests/test_copying.py | 20 +-
 python/cudf/cudf/tests/test_csv.py | 136 +---
 python/cudf/cudf/tests/test_cuda_apply.py | 7 +-
 .../cudf/tests/test_cuda_array_interface.py | 11 +-
 python/cudf/cudf/tests/test_cut.py | 12 +-
 python/cudf/cudf/tests/test_dask.py | 6 +-
 python/cudf/cudf/tests/test_dataframe.py | 553 +++++-----
 python/cudf/cudf/tests/test_dataframe_copy.py | 46 +-
 python/cudf/cudf/tests/test_datasets.py | 4 +-
 python/cudf/cudf/tests/test_datetime.py | 58 +-
 python/cudf/cudf/tests/test_decimal.py | 24 +-
 python/cudf/cudf/tests/test_doctests.py | 10 +-
 python/cudf/cudf/tests/test_dropna.py | 8 +-
 python/cudf/cudf/tests/test_dtypes.py | 37 +-
 python/cudf/cudf/tests/test_duplicates.py | 16 +-
 python/cudf/cudf/tests/test_feather.py | 7 +-
 python/cudf/cudf/tests/test_groupby.py | 257 ++------
 python/cudf/cudf/tests/test_hash_vocab.py | 6 +-
 python/cudf/cudf/tests/test_hdf.py | 12 +-
 python/cudf/cudf/tests/test_hdfs.py | 30 +-
 python/cudf/cudf/tests/test_index.py | 108 +---
 python/cudf/cudf/tests/test_indexing.py | 124 +---
 python/cudf/cudf/tests/test_interpolate.py | 4 +-
 python/cudf/cudf/tests/test_interval.py | 4 +-
 python/cudf/cudf/tests/test_join_order.py | 7 +-
 python/cudf/cudf/tests/test_joining.py | 144 ++---
 python/cudf/cudf/tests/test_json.py | 103 +--
 python/cudf/cudf/tests/test_list.py | 30 +-
 python/cudf/cudf/tests/test_monotonic.py | 20 +-
 python/cudf/cudf/tests/test_multiindex.py | 134 ++--
 python/cudf/cudf/tests/test_numerical.py | 36 +-
 python/cudf/cudf/tests/test_numpy_interop.py | 10 +-
 python/cudf/cudf/tests/test_onehot.py | 24 +-
 python/cudf/cudf/tests/test_options.py | 29 +-
 python/cudf/cudf/tests/test_orc.py | 113 +---
 python/cudf/cudf/tests/test_pack.py | 40 +-
 python/cudf/cudf/tests/test_parquet.py | 128 +---
 python/cudf/cudf/tests/test_pickling.py | 4 +-
 python/cudf/cudf/tests/test_query.py | 6 +-
 python/cudf/cudf/tests/test_rank.py | 8 +-
 python/cudf/cudf/tests/test_reductions.py | 8 +-
 python/cudf/cudf/tests/test_replace.py | 83 +--
 python/cudf/cudf/tests/test_repr.py | 80 +--
 python/cudf/cudf/tests/test_resampling.py | 8 +-
 python/cudf/cudf/tests/test_reshape.py | 104 +--
 python/cudf/cudf/tests/test_rolling.py | 56 +-
 python/cudf/cudf/tests/test_s3.py | 28 +-
 python/cudf/cudf/tests/test_scalar.py | 18 +-
 python/cudf/cudf/tests/test_search.py | 4 +-
 python/cudf/cudf/tests/test_serialize.py | 20 +-
 python/cudf/cudf/tests/test_series.py | 129 +---
 python/cudf/cudf/tests/test_seriesmap.py | 6 +-
 python/cudf/cudf/tests/test_setitem.py | 36 +-
 python/cudf/cudf/tests/test_sorting.py | 60 +-
 python/cudf/cudf/tests/test_spilling.py | 34 +-
 python/cudf/cudf/tests/test_stats.py | 42 +-
 python/cudf/cudf/tests/test_string.py | 107 +---
 python/cudf/cudf/tests/test_string_udfs.py | 4 +-
 python/cudf/cudf/tests/test_struct.py | 14 +-
 python/cudf/cudf/tests/test_testing.py | 21 +-
 python/cudf/cudf/tests/test_timedelta.py | 40 +-
 python/cudf/cudf/tests/test_udf_masked_ops.py | 31 +-
 python/cudf/cudf/tests/test_unaops.py | 5 +-
 .../cudf/tests/text/test_subword_tokenizer.py | 21 +-
 .../cudf/cudf/tests/text/test_text_methods.py | 22 +-
 python/cudf/cudf/utils/_numba.py | 20 +-
 python/cudf/cudf/utils/_ptxcompiler.py | 10 +-
 python/cudf/cudf/utils/applyutils.py | 51 +-
 python/cudf/cudf/utils/cudautils.py | 10 +-
 python/cudf/cudf/utils/dtypes.py | 96 +--
 python/cudf/cudf/utils/gpu_utils.py | 8 +-
 python/cudf/cudf/utils/hash_vocab_utils.py | 27 +-
 python/cudf/cudf/utils/ioutils.py | 40 +-
 python/cudf/cudf/utils/nvtx_annotation.py | 6 +-
 python/cudf/cudf/utils/queryutils.py | 16 +-
 python/cudf/cudf/utils/utils.py | 11 +-
 .../cudf_pandas_tests/test_cudf_pandas.py | 80 +--
 .../cudf_pandas_tests/test_fast_slow_proxy.py | 12 +-
 .../cudf/cudf_pandas_tests/test_profiler.py | 8 +-
 python/cudf_kafka/cudf_kafka/_version.py | 7 +-
 python/custreamz/custreamz/_version.py | 7 +-
 python/custreamz/custreamz/kafka.py | 5 +-
 python/custreamz/custreamz/tests/conftest.py | 4 +-
 .../custreamz/tests/test_dataframes.py | 64 +-
 python/dask_cudf/dask_cudf/_version.py | 7 +-
 python/dask_cudf/dask_cudf/backends.py | 51 +-
 python/dask_cudf/dask_cudf/core.py | 27 +-
 .../dask_cudf/dask_cudf/expr/_collection.py | 4 +-
 python/dask_cudf/dask_cudf/groupby.py | 32 +-
 python/dask_cudf/dask_cudf/io/orc.py | 22 +-
 python/dask_cudf/dask_cudf/io/parquet.py | 15 +-
 .../dask_cudf/dask_cudf/io/tests/test_csv.py | 22 +-
 .../dask_cudf/io/tests/test_parquet.py | 56 +-
 .../dask_cudf/dask_cudf/io/tests/test_s3.py | 4 +-
 python/dask_cudf/dask_cudf/sorting.py | 15 +-
 .../dask_cudf/tests/test_accessor.py | 29 +-
 python/dask_cudf/dask_cudf/tests/test_core.py | 46 +-
 .../dask_cudf/dask_cudf/tests/test_groupby.py | 89 +--
 python/dask_cudf/dask_cudf/tests/test_join.py | 34 +-
 .../dask_cudf/tests/test_reductions.py | 4 +-
 python/dask_cudf/dask_cudf/tests/test_sort.py | 12 +-
 212 files changed, 2531 insertions(+), 7053 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9235c80bdc9..e7bea7dbbb1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,13 +23,6 @@ repos:
         args:
["--config-root=python/", "--resolve-all-configs"] files: python/.* types_or: [python, cython, pyi] - - repo: https://github.com/psf/black - rev: 23.12.1 - hooks: - - id: black - files: python/.* - # Explicitly specify the pyproject.toml at the repo root, not per-project. - args: ["--config", "pyproject.toml"] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.0 hooks: @@ -155,6 +148,8 @@ repos: hooks: - id: ruff files: python/.*$ + - id: ruff-format + files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks rev: v0.0.1 hooks: diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 59d73015962..9b51d5bd5f2 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -30,9 +30,7 @@ def bench_eval_func(benchmark, expr, dataframe): [2, 3, 4], ) def bench_merge(benchmark, dataframe, num_key_cols): - benchmark( - dataframe.merge, dataframe, on=list(dataframe.columns[:num_key_cols]) - ) + benchmark(dataframe.merge, dataframe, on=list(dataframe.columns[:num_key_cols])) # TODO: Some of these cases could be generalized to an IndexedFrame benchmark @@ -67,9 +65,7 @@ def random_state(request): def bench_sample(benchmark, dataframe, axis, frac, random_state): if axis == 1 and isinstance(random_state, cupy.random.RandomState): pytest.skip("Unsupported params.") - benchmark( - dataframe.sample, frac=frac, axis=axis, random_state=random_state - ) + benchmark(dataframe.sample, frac=frac, axis=axis, random_state=random_state) @benchmark_with_object(cls="dataframe", dtype="int") @@ -121,10 +117,7 @@ def bench_groupby(benchmark, dataframe, num_key_cols): [ "sum", ["sum", "mean"], - { - f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"] - for i in range(6) - }, + {f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"] for i in range(6)}, ], ) @pytest.mark.parametrize( @@ -154,9 +147,7 @@ def bench_groupby_sample( kwargs = {"frac": target_sample_frac, "replace": replace} else: minsize = grouper.size().min() - target_size = numpy.round( - target_sample_frac * minsize, decimals=0 - ).astype(int) + target_size = numpy.round(target_sample_frac * minsize, decimals=0).astype(int) kwargs = {"n": target_size, "replace": replace} benchmark(grouper.sample, **kwargs) @@ -165,9 +156,7 @@ def bench_groupby_sample( @benchmark_with_object(cls="dataframe", dtype="int") @pytest.mark.parametrize("num_cols_to_sort", [1]) def bench_sort_values(benchmark, dataframe, num_cols_to_sort): - benchmark( - dataframe.sort_values, list(dataframe.columns[:num_cols_to_sort]) - ) + benchmark(dataframe.sort_values, list(dataframe.columns[:num_cols_to_sort])) @benchmark_with_object(cls="dataframe", dtype="int") diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index 93109838900..9ab1a55ff33 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -9,9 +9,7 @@ from utils import benchmark_with_object -@pytest_cases.parametrize_with_cases( - "objs", prefix="concat", cases="cases_functions" -) +@pytest_cases.parametrize_with_cases("objs", prefix="concat", cases="cases_functions") @pytest.mark.parametrize( "axis", [ @@ -21,9 +19,7 @@ @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("ignore_index", [True, False]) def bench_concat_axis_1(benchmark, objs, axis, join, ignore_index): - benchmark( - cudf.concat, objs=objs, axis=axis, join=join, 
ignore_index=ignore_index - ) + benchmark(cudf.concat, objs=objs, axis=axis, join=join, ignore_index=ignore_index) @pytest.mark.parametrize("size", [10_000, 100_000]) @@ -51,9 +47,7 @@ def bench_get_dummies_simple(benchmark, prefix): "col3": cudf.Series(list(range(100, 110)), dtype="category"), } ) - benchmark( - cudf.get_dummies, df, columns=["col1", "col2", "col3"], prefix=prefix - ) + benchmark(cudf.get_dummies, df, columns=["col1", "col2", "col3"], prefix=prefix) @benchmark_with_object(cls="dataframe", dtype="int", cols=6) diff --git a/python/cudf/benchmarks/API/bench_multiindex.py b/python/cudf/benchmarks/API/bench_multiindex.py index 6268bcc4267..6d4d6ec0942 100644 --- a/python/cudf/benchmarks/API/bench_multiindex.py +++ b/python/cudf/benchmarks/API/bench_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of MultiIndex methods.""" @@ -31,9 +31,7 @@ def bench_from_pandas(benchmark, pidx): def bench_constructor(benchmark, midx): - benchmark( - cudf.MultiIndex, codes=midx.codes, levels=midx.levels, names=midx.names - ) + benchmark(cudf.MultiIndex, codes=midx.codes, levels=midx.levels, names=midx.names) def bench_from_frame(benchmark, midx): diff --git a/python/cudf/benchmarks/API/cases_functions.py b/python/cudf/benchmarks/API/cases_functions.py index 6bc66aa4a9b..627447fbc12 100644 --- a/python/cudf/benchmarks/API/cases_functions.py +++ b/python/cudf/benchmarks/API/cases_functions.py @@ -28,9 +28,7 @@ def concat_case_contiguous_indexes(nr): @pytest_cases.parametrize("nr", NUM_ROWS) def concat_case_contiguous_indexes_different_cols(nr): return [ - cudf.DataFrame( - {"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)} - ), + cudf.DataFrame({"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}), cudf.DataFrame( {"c": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), @@ -117,30 +115,22 @@ def concat_case_unique_columns(nr): @pytest_cases.parametrize("nr", NUM_ROWS) def concat_case_unique_columns_with_different_range_index(nr): return [ - cudf.DataFrame( - {"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)} - ), + cudf.DataFrame({"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}), cudf.DataFrame( {"c": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), ), - cudf.DataFrame( - {"d": cupy.tile([1, 2, 3], nr), "e": cupy.tile([4, 5, 7], nr)} - ), + cudf.DataFrame({"d": cupy.tile([1, 2, 3], nr), "e": cupy.tile([4, 5, 7], nr)}), cudf.DataFrame( {"f": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), ), - cudf.DataFrame( - {"g": cupy.tile([1, 2, 3], nr), "h": cupy.tile([4, 5, 7], nr)} - ), + cudf.DataFrame({"g": cupy.tile([1, 2, 3], nr), "h": cupy.tile([4, 5, 7], nr)}), cudf.DataFrame( {"i": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), ), - cudf.DataFrame( - {"j": cupy.tile([1, 2, 3], nr), "k": cupy.tile([4, 5, 7], nr)} - ), + cudf.DataFrame({"j": cupy.tile([1, 2, 3], nr), "k": cupy.tile([4, 5, 7], nr)}), cudf.DataFrame( {"l": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), diff --git a/python/cudf/benchmarks/common/utils.py b/python/cudf/benchmarks/common/utils.py index 363316f0930..1b79882ca17 100644 --- a/python/cudf/benchmarks/common/utils.py +++ b/python/cudf/benchmarks/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
"""Common utilities for fixture creation and benchmarking.""" @@ -42,9 +42,7 @@ def make_boolean_mask_column(size): return cudf.core.column.as_column(rstate.randint(0, 2, size).astype(bool)) -def benchmark_with_object( - cls, *, dtype="int", nulls=None, cols=None, rows=None -): +def benchmark_with_object(cls, *, dtype="int", nulls=None, cols=None, rows=None): """Pass "standard" cudf fixtures to functions without renaming parameters. The fixture generation logic in conftest.py provides a plethora of useful diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..4c568f9bdc8 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -93,10 +93,7 @@ def make_dataframe(nr, nc, column_generator=column_generator): string.ascii_lowercase ), "make_dataframe only supports a maximum of 26 columns" return cudf.DataFrame( - { - f"{string.ascii_lowercase[i]}": column_generator(nr) - for i in range(nc) - } + {f"{string.ascii_lowercase[i]}": column_generator(nr) for i in range(nc)} ) for nr in NUM_ROWS: @@ -108,9 +105,7 @@ def make_dataframe(nr, nc, column_generator=column_generator): # https://github.com/smarie/python-pytest-cases/issues/278 # Once that is fixed we could remove all the extraneous `request` # fixtures in these fixtures. - def series_nulls_false( - request, nr=nr, column_generator=column_generator - ): + def series_nulls_false(request, nr=nr, column_generator=column_generator): return cudf.Series(column_generator(nr)) make_fixture( @@ -120,9 +115,7 @@ def series_nulls_false( fixtures, ) - def series_nulls_true( - request, nr=nr, column_generator=column_generator - ): + def series_nulls_true(request, nr=nr, column_generator=column_generator): s = cudf.Series(column_generator(nr)) s.iloc[::2] = None return s @@ -135,9 +128,7 @@ def series_nulls_true( ) # For now, not bothering to include a nullable index fixture. 
- def index_nulls_false( - request, nr=nr, column_generator=column_generator - ): + def index_nulls_false(request, nr=nr, column_generator=column_generator): return cudf.Index(column_generator(nr)) make_fixture( diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index 8da769b7858..cacd5574b57 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -31,9 +31,7 @@ def bench_unique_single_column(benchmark, column): @pytest.mark.parametrize("nullify", [True, False]) @pytest.mark.parametrize("gather_how", ["sequence", "reverse", "random"]) def bench_take(benchmark, column, gather_how, nullify): - gather_map = make_gather_map( - column.size * 0.4, column.size, gather_how - )._column + gather_map = make_gather_map(column.size * 0.4, column.size, gather_how)._column benchmark(column.take, gather_map, nullify=nullify) @@ -107,8 +105,6 @@ def setitem_case_int_column_align_to_col_size(column): # column (len(val) != len(key) and len == num_true) -@pytest_cases.parametrize_with_cases( - "column,key,value", cases=".", prefix="setitem" -) +@pytest_cases.parametrize_with_cases("column,key,value", cases=".", prefix="setitem") def bench_setitem(benchmark, column, key, value): benchmark(column.__setitem__, key, value) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index d9974037daa..ed647e45528 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import io @@ -69,17 +69,14 @@ def generate_input(self): - cudf.utils.dtypes.TIMEDELTA_TYPES ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 5b49143fd5a..54bac6c5f26 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -53,16 +53,13 @@ def generate_input(self): seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -84,18 +81,12 @@ def set_rand_params(self, params): col_val = np.random.choice( [ None, - np.unique( - np.random.choice(self._df.columns, col_size) - ), + 
np.unique(np.random.choice(self._df.columns, col_size)), ] ) - params_dict[param] = ( - col_val if col_val is None else list(col_val) - ) + params_dict[param] = col_val if col_val is None else list(col_val) elif param == "dtype": - dtype_val = np.random.choice( - [None, self._df.dtypes.to_dict()] - ) + dtype_val = np.random.choice([None, self._df.dtypes.to_dict()]) if dtype_val is not None: dtype_val = { col_name: "category" @@ -110,13 +101,9 @@ def set_rand_params(self, params): ) params_dict[param] = header_val elif param == "skiprows": - params_dict[param] = np.random.randint( - low=0, high=len(self._df) - ) + params_dict[param] = np.random.randint(low=0, high=len(self._df)) elif param == "skipfooter": - params_dict[param] = np.random.randint( - low=0, high=len(self._df) - ) + params_dict[param] = np.random.randint(low=0, high=len(self._df)) elif param == "nrows": nrows_val = np.random.choice( [None, np.random.randint(low=0, high=len(self._df))] @@ -158,16 +145,13 @@ def generate_input(self): seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -188,9 +172,7 @@ def set_rand_params(self, params): col_size = self._rand(len(self._current_buffer.columns)) params_dict[param] = list( np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) + np.random.choice(self._current_buffer.columns, col_size) ) ) elif param == "chunksize": diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index ee1b2c1f1c4..ccea536df00 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -71,9 +71,7 @@ def write_crash(self, error): crash_log_path = error_file_name + "_crash.log" with open(crash_path, "w") as f: - json.dump( - self._data_handler.current_params, f, sort_keys=True, indent=4 - ) + json.dump(self._data_handler.current_params, f, sort_keys=True, indent=4) logging.info(f"Crash params was written to {crash_path}") diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index ffb7171a855..e757e2b602b 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import json @@ -49,9 +49,7 @@ def __init__( else: for i in os.listdir(path): file_name = os.path.join(path, i) - if os.path.isfile(file_name) and file_name.endswith( - "_crash.json" - ): + if os.path.isfile(file_name) and file_name.endswith("_crash.json"): self._load_params(file_name) self._regression = bool(self._inputs) self._idx = 0 @@ -76,9 +74,7 @@ def current_params(self): def get_next_regression_params(self): if self._idx >= len(self._inputs): - logging.info( - "Reached the end of all crash.json files to run..Exiting.." 
- ) + logging.info("Reached the end of all crash.json files to run..Exiting..") sys.exit(0) param = self._inputs[self._idx] dtypes_meta = param["dtypes_meta"] diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index bffd508b2ef..800c4baa851 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -79,16 +79,13 @@ def generate_input(self): # issue is fixed: # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -154,16 +151,13 @@ def generate_input(self): # issue is fixed: # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py index 54e49b63e41..beb68ab2e43 100644 --- a/python/cudf/cudf/_fuzz_testing/main.py +++ b/python/cudf/cudf/_fuzz_testing/main.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._fuzz_testing import fuzzer @@ -20,9 +20,7 @@ def __init__(self, func, params=None, data_handle=None, **kwargs): params=params, write_data_on_failure=kwargs.get("write_data_on_failure", True), max_lists_length=kwargs.get("max_lists_length", None), - max_lists_nesting_depth=kwargs.get( - "max_lists_nesting_depth", None - ), + max_lists_nesting_depth=kwargs.get("max_lists_nesting_depth", None), ) def __call__(self, *args, **kwargs): diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index ecddc72fa85..8136c022be5 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import copy import io @@ -63,9 +63,7 @@ def generate_input(self): - {"datetime64[ns]"} ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) @@ -73,8 +71,7 @@ def generate_input(self): self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -112,9 +109,7 @@ def set_rand_params(self, params): map( int, np.unique( - np.random.choice( - stripes, orcFile.nstripes - ) + np.random.choice(stripes, orcFile.nstripes) ), ) ), @@ -178,17 +173,14 @@ def generate_input(self): - cudf.utils.dtypes.DATETIME_TYPES ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 2d934e4816d..caf2e2b7a93 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import logging import random @@ -60,17 +60,14 @@ def generate_input(self): | {"list", "decimal64"} ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -145,16 +142,13 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py index 2f5e6204f7c..a34b3051608 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import io import sys @@ -44,9 +44,7 @@ def json_writer_test(pdf): # https://github.com/rapidsai/cudf/issues/6429 # compare_content(pdf_buffer, gdf_buffer) - actual = cudf.read_json( - gdf_buffer, engine="cudf", lines=True, orient="records" - ) + actual = cudf.read_json(gdf_buffer, engine="cudf", lines=True, orient="records") expected = pd.read_json(pdf_buffer, lines=True, orient="records") expected.columns = expected.columns.astype("str") assert_eq(actual, expected) @@ -62,12 +60,8 @@ def json_writer_test(pdf): def json_writer_test_params(pdf, compression, dtype): gdf = cudf.from_pandas(pdf) - pdf_buffer = pdf.to_json( - lines=True, orient="records", compression=compression - ) - gdf_buffer = gdf.to_json( - lines=True, orient="records", compression=compression - ) + pdf_buffer = pdf.to_json(lines=True, orient="records", compression=compression) + gdf_buffer = gdf.to_json(lines=True, orient="records", compression=compression) # TODO: Uncomment once this is fixed: # https://github.com/rapidsai/cudf/issues/6429 diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index 977038d1fcb..d67d3989b2d 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import io import sys @@ -54,18 +54,14 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): ) def orc_reader_stripes_test(input_tuple, columns, stripes): _, file_buffer = input_tuple - expected_pdf = orc_to_pandas( - file_io_obj=io.BytesIO(file_buffer), stripes=stripes - ) + expected_pdf = orc_to_pandas(file_io_obj=io.BytesIO(file_buffer), stripes=stripes) if columns is not None and len(columns) > 0: # ORC reader picks columns if only # there are any elements in `columns` expected_pdf = expected_pdf[columns] - gdf = cudf.read_orc( - io.BytesIO(file_buffer), columns=columns, stripes=stripes - ) + gdf = cudf.read_orc(io.BytesIO(file_buffer), columns=columns, stripes=stripes) compare_dataframe(expected_pdf, gdf) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index 3d070576a12..115c1b67518 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import sys @@ -72,9 +72,7 @@ def parquet_writer_test(pdf): "compression": ["snappy", None], }, ) -def parquet_writer_test_rowgroup_index_compression( - pdf, compression, row_group_size -): +def parquet_writer_test_rowgroup_index_compression(pdf, compression, row_group_size): pd_file_name = "cpu_pdf.parquet" gd_file_name = "gpu_pdf.parquet" diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 6e53195ac2d..4378989d6e3 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -74,9 +74,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["lists_max_length"] = obj._max_lists_length if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint( - 1, np.iinfo("int64").max - ) + meta["nesting_max_depth"] = np.random.randint(1, np.iinfo("int64").max) else: meta["nesting_max_depth"] = obj._max_lists_nesting_depth @@ -95,13 +93,9 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_null_frequency"] = obj._max_struct_null_frequency if obj._max_struct_types_at_each_level is None: - meta["max_types_at_each_level"] = np.random.randint( - low=1, high=10 - ) + meta["max_types_at_each_level"] = np.random.randint(low=1, high=10) else: - meta[ - "max_types_at_each_level" - ] = obj._max_struct_types_at_each_level + meta["max_types_at_each_level"] = obj._max_struct_types_at_each_level elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION @@ -123,9 +117,7 @@ def run_test(funcs, args): try: funcs[function_name_to_run]() except KeyError: - print( - f"Provided function name({function_name_to_run}) does not exist." 
- ) + print(f"Provided function name({function_name_to_run}) does not exist.") def pyarrow_to_pandas(table): diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 206173919e1..bab9514d622 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -24,9 +24,7 @@ DtypeObj = Union["ExtensionDtype", np.dtype] # scalars -DatetimeLikeScalar = TypeVar( - "DatetimeLikeScalar", Period, Timestamp, Timedelta -) +DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", Period, Timestamp, Timedelta) ScalarLike = Any # columns @@ -41,6 +39,4 @@ # Groupby aggregation AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] -] +MultiColumnAggType = Union[AggType, Iterable[AggType], Dict[Any, Iterable[AggType]]] diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py index ecf6ddd8e3b..8e4105d9a2e 100644 --- a/python/cudf/cudf/_version.py +++ b/python/cudf/cudf/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,5 @@ import importlib.resources -__version__ = ( - importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() -) +__version__ = importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() __git_commit__ = "" diff --git a/python/cudf/cudf/api/extensions/accessor.py b/python/cudf/cudf/api/extensions/accessor.py index e4988c1fa68..73e0594dd1c 100644 --- a/python/cudf/cudf/api/extensions/accessor.py +++ b/python/cudf/cudf/api/extensions/accessor.py @@ -117,9 +117,7 @@ ) doc_register_index_accessor = docfmt_partial( - docstring=_docstring_register_accessor.format( - klass="Index", example=_index_example - ) + docstring=_docstring_register_accessor.format(klass="Index", example=_index_example) ) doc_register_series_accessor = docfmt_partial( diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 417d8b0922a..86fa4f24c4f 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -108,10 +108,7 @@ def is_string_dtype(obj): Whether or not the array or dtype is of the string dtype. """ return ( - ( - isinstance(obj, (cudf.Index, cudf.Series)) - and obj.dtype == cudf.dtype("O") - ) + (isinstance(obj, (cudf.Index, cudf.Series)) and obj.dtype == cudf.dtype("O")) or (isinstance(obj, cudf.core.column.StringColumn)) or ( pd.api.types.is_string_dtype(obj) @@ -174,9 +171,9 @@ def _is_scalar_or_zero_d_array(val): bool Return True if given object is scalar. 
""" - return ( - isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0 - ) or is_scalar(val) + return (isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0) or is_scalar( + val + ) # TODO: We should be able to reuse the pandas function for this, need to figure @@ -242,9 +239,7 @@ def _union_categoricals( ) if sort_categories: sorted_categories = result_col.categories.sort_values(ascending=True) - result_col = result_col.reorder_categories( - new_categories=sorted_categories - ) + result_col = result_col.reorder_categories(new_categories=sorted_categories) return cudf.Index(result_col) @@ -293,9 +288,7 @@ def is_bool_dtype(arr_or_dtype): else: return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype.dtype) elif isinstance(arr_or_dtype, cudf.CategoricalDtype): - return pd_types.is_bool_dtype( - arr_or_dtype=arr_or_dtype.categories.dtype - ) + return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype.categories.dtype) else: return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype) @@ -506,9 +499,7 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: elif isinstance(dtype_to_check, pd.CategoricalDtype): if dtype_to_check.categories is None: return False - return _is_pandas_nullable_extension_dtype( - dtype_to_check.categories.dtype - ) + return _is_pandas_nullable_extension_dtype(dtype_to_check.categories.dtype) elif isinstance(dtype_to_check, pd.IntervalDtype): return _is_pandas_nullable_extension_dtype(dtype_to_check.subtype) return False @@ -525,18 +516,14 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: is_datetime_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) is_datetime64_any_dtype = pd_types.is_datetime64_any_dtype is_datetime64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) -is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api( - pd_types.is_datetime64_ns_dtype -) +is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_ns_dtype) is_extension_array_dtype = pd_types.is_extension_array_dtype is_int64_dtype = pd_types.is_int64_dtype is_period_dtype = pd_types.is_period_dtype is_signed_integer_dtype = pd_types.is_signed_integer_dtype is_timedelta_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) is_timedelta64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) -is_timedelta64_ns_dtype = _wrap_pandas_is_dtype_api( - pd_types.is_timedelta64_ns_dtype -) +is_timedelta64_ns_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_ns_dtype) is_unsigned_integer_dtype = pd_types.is_unsigned_integer_dtype is_sparse = pd_types.is_sparse # is_list_like = pd_types.is_list_like diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index de44f392eef..59489700f8f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -277,9 +277,7 @@ def __getitem__(self, key): def __contains__(self, item): return item in self._values - def _copy_type_metadata( - self, other: Self, *, override_dtypes=None - ) -> Self: + def _copy_type_metadata(self, other: Self, *, override_dtypes=None) -> Self: raise NotImplementedError def get_level_values(self, level): @@ -348,9 +346,7 @@ def names(self, values): num_values = len(values) if num_values > 1: - raise ValueError( - "Length of new names must be 1, got %d" % num_values - ) + raise ValueError("Length of new names must be 1, got %d" % num_values) self.name = values[0] @@ -605,9 +601,7 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - 
is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( + if (is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype)) or ( not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) ): # Bools + other types will result in mixed type. @@ -625,17 +619,13 @@ def union(self, other, sort=None): raise MixedTypeError("Cannot perform union with mixed types") if not len(other) or self.equals(other): - common_dtype = cudf.utils.dtypes.find_common_type( - [self.dtype, other.dtype] - ) + common_dtype = cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) res = self._get_reconciled_name_object(other).astype(common_dtype) if sort: return res.sort_values() return res elif not len(self): - common_dtype = cudf.utils.dtypes.find_common_type( - [self.dtype, other.dtype] - ) + common_dtype = cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) res = other._get_reconciled_name_object(self).astype(common_dtype) if sort: return res.sort_values() @@ -795,9 +785,7 @@ def fillna(self, value, downcast=None): Index([1, 2, 3, 4], dtype='int64') """ if downcast is not None: - raise NotImplementedError( - "`downcast` parameter is not yet supported" - ) + raise NotImplementedError("`downcast` parameter is not yet supported") return super().fillna(value=value) @@ -855,9 +843,7 @@ def to_frame(self, index=True, name=no_default): else: col_name = name - return cudf.DataFrame( - {col_name: self._values}, index=self if index else None - ) + return cudf.DataFrame({col_name: self._values}, index=self if index else None) def to_arrow(self): """Convert to a suitable Arrow object.""" @@ -1489,9 +1475,7 @@ def _union(self, other, sort=None): self_df["order"] = self_df.index other_df["order"] = other_df.index res = self_df.merge(other_df, on=[0], how="outer") - res = res.sort_values( - by=res._data.to_pandas_index()[1:], ignore_index=True - ) + res = res.sort_values(by=res._data.to_pandas_index()[1:], ignore_index=True) union_result = cudf.core.index._index_from_data({0: res._data[0]}) if sort in {None, True} and len(other): @@ -1609,9 +1593,7 @@ def sort_values( else: return index_sorted - def join( - self, other, how="left", level=None, return_indexers=False, sort=False - ): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ Compute join_index and indexers to conform data structures to the new index. 
@@ -1786,16 +1768,8 @@ def find_label_range(self, loc: slice) -> slice: start_side, stop_side = "right", "left" else: start_side, stop_side = "left", "right" - istart = ( - None - if start is None - else self.get_slice_bound(start, side=start_side) - ) - istop = ( - None - if stop is None - else self.get_slice_bound(stop, side=stop_side) - ) + istart = None if start is None else self.get_slice_bound(start, side=start_side) + istop = None if stop is None else self.get_slice_bound(stop, side=stop_side) if step < 0: # Fencepost istart = None if istart is None else max(istart - 1, 0) @@ -1866,9 +1840,7 @@ def get_slice_bound( except ValueError: raise KeyError(f"{label=} not in index") if left != right: - raise KeyError( - f"Cannot get slice bound for non-unique label {label=}" - ) + raise KeyError(f"Cannot get slice bound for non-unique label {label=}") if side == "left": return left else: @@ -1890,9 +1862,7 @@ def __array_function__(self, func, types, args, kwargs): # check if we don't handle any of the types (including sub-class) for t in types: - if not any( - issubclass(t, handled_type) for handled_type in handled_types - ): + if not any(issubclass(t, handled_type) for handled_type in handled_types): return NotImplemented if hasattr(cudf_index_module, fname): @@ -1943,9 +1913,7 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default): Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) + nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None if not isinstance(index, pd.Index): raise TypeError("not a pandas.Index") @@ -2093,9 +2061,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype(size_type_dtype) - if not _gather_map_is_valid( - gather_map, len(self), check_bounds, nullify - ): + if not _gather_map_is_valid(gather_map, len(self), check_bounds, nullify): raise IndexError("Gather map index is out of bounds.") return self._from_columns_like_self( @@ -2128,13 +2094,9 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): """ if axis not in {0, "index"}: - raise NotImplementedError( - "Gather along column axis is not yet supported." - ) + raise NotImplementedError("Gather along column axis is not yet supported.") if not allow_fill or fill_value is not None: - raise NotImplementedError( - "`allow_fill` and `fill_value` are unsupported." - ) + raise NotImplementedError("`allow_fill` and `fill_value` are unsupported.") return self._gather(indices) diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 5cb9f0363e0..b69c456b7ba 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import ast import functools @@ -208,8 +208,7 @@ def visit_Call(self, node): # Assuming only unary functions are supported, which is checked above. if len(node.args) != 1 or node.keywords: raise ValueError( - f"Function {node.func} only accepts one positional " - "argument." + f"Function {node.func} only accepts one positional " "argument." 
) self.visit(node.args[0]) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 053425fff8d..6f260424b5e 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -78,27 +78,22 @@ def _find_and_read_tzfile_tzdata(zone_name): def _read_tzfile_as_frame(tzdir, zone_name): - transition_times_and_offsets = make_timezone_transition_table( - tzdir, zone_name - ) + transition_times_and_offsets = make_timezone_transition_table(tzdir, zone_name) if not transition_times_and_offsets: # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - transition_times_and_offsets = as_column([min_date]), as_column( - [np.timedelta64(0, "s")] + transition_times_and_offsets = ( + as_column([min_date]), + as_column([np.timedelta64(0, "s")]), ) return DataFrame._from_data( - dict( - zip(["transition_times", "offsets"], transition_times_and_offsets) - ) + dict(zip(["transition_times", "offsets"], transition_times_and_offsets)) ) -def _find_ambiguous_and_nonexistent( - data: DatetimeColumn, zone_name: str -) -> Tuple: +def _find_ambiguous_and_nonexistent(data: DatetimeColumn, zone_name: str) -> Tuple: """ Recognize ambiguous and nonexistent timestamps for the given timezone. @@ -112,9 +107,7 @@ def _find_ambiguous_and_nonexistent( """ tz_data_for_zone = get_tz_data(zone_name) transition_times = tz_data_for_zone["transition_times"] - offsets = tz_data_for_zone["offsets"].astype( - f"timedelta64[{data._time_unit}]" - ) + offsets = tz_data_for_zone["offsets"].astype(f"timedelta64[{data._time_unit}]") if len(offsets) == 1: # no transitions return False, False @@ -170,17 +163,12 @@ def localize( data: DatetimeColumn, zone_name: str, ambiguous, nonexistent ) -> DatetimeTZColumn: if ambiguous != "NaT": - raise NotImplementedError( - "Only ambiguous='NaT' is currently supported" - ) + raise NotImplementedError("Only ambiguous='NaT' is currently supported") if nonexistent != "NaT": - raise NotImplementedError( - "Only nonexistent='NaT' is currently supported" - ) + raise NotImplementedError("Only nonexistent='NaT' is currently supported") if isinstance(data, DatetimeTZColumn): raise ValueError( - "Already localized. " - "Use `tz_convert` to convert between time zones." + "Already localized. " "Use `tz_convert` to convert between time zones." ) dtype = pd.DatetimeTZDtype(data._time_unit, zone_name) ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index ef6b10f66c1..dd6069cbb4e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import warnings from typing import Tuple, Union @@ -81,9 +81,7 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( - other, source_dtype - ): + if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast(other, source_dtype): common_dtype = source_dtype elif ( isinstance(source_col, cudf.core.column.NumericalColumn) @@ -116,9 +114,7 @@ def _make_categorical_like(result, column): if isinstance(column, cudf.core.column.CategoricalColumn): result = cudf.core.column.build_categorical_column( categories=column.categories, - codes=cudf.core.column.build_column( - result.base_data, dtype=result.dtype - ), + codes=cudf.core.column.build_column(result.base_data, dtype=result.dtype), mask=result.base_mask, size=result.size, offset=result.offset, diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index ce6bb83bc77..044b709266c 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -42,9 +42,7 @@ def serialize(self): :meta private: """ - raise NotImplementedError( - "Subclasses of Serializable must implement serialize" - ) + raise NotImplementedError("Subclasses of Serializable must implement serialize") @classmethod def deserialize(cls, header, frames): @@ -99,9 +97,7 @@ def device_serialize(self): for f in frames ) header["type-serialized"] = pickle.dumps(type(self)) - header["is-cuda"] = [ - hasattr(f, "__cuda_array_interface__") for f in frames - ] + header["is-cuda"] = [hasattr(f, "__cuda_array_interface__") for f in frames] header["lengths"] = [f.nbytes for f in frames] return header, frames diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 33cec21caa5..deb98d2f39f 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -138,9 +138,7 @@ def _index_or_values_interpolation(column, index=None): return column to_interp = IndexedFrame(data={None: column}, index=index) - known_x_and_y = to_interp._apply_boolean_mask( - BooleanMask(~mask, len(to_interp)) - ) + known_x_and_y = to_interp._apply_boolean_mask(BooleanMask(~mask, len(to_interp))) known_x = known_x_and_y._index._column.values known_y = known_x_and_y._data.columns[0].values diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 8d278c9c065..55780142001 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -160,9 +160,7 @@ def _from_device_memory(cls, data: Any, exposed: bool) -> Self: ret._ptr = data.ptr ret._size = data.size else: - ret._ptr, ret._size = get_ptr_and_size( - data.__cuda_array_interface__ - ) + ret._ptr, ret._size = get_ptr_and_size(data.__cuda_array_interface__) if ret.size < 0: raise ValueError("size cannot be negative") return ret @@ -258,9 +256,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: """ return self._ptr - def memoryview( - self, *, offset: int = 0, size: Optional[int] = None - ) -> memoryview: + def memoryview(self, *, offset: int = 0, size: Optional[int] = None) -> memoryview: """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) @@ -308,9 +304,7 @@ def __init__( if offset < 0: raise ValueError("offset cannot be negative") if offset + size > owner.size: - raise ValueError( - "offset+size cannot be greater than the size of owner" - ) + raise ValueError("offset+size cannot 
be greater than the size of owner") self._owner = owner self._offset = offset self._size = size diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 3e654e01401..cc1eecf827d 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -21,9 +21,7 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes -_spill_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="cudf_python-spill" -) +_spill_cudf_nvtx_annotate = partial(_cudf_nvtx_annotate, domain="cudf_python-spill") def get_traceback() -> str: @@ -382,21 +380,15 @@ def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int: int The number of bytes spilled. """ - limit = ( - self._device_memory_limit if device_limit is None else device_limit - ) + limit = self._device_memory_limit if device_limit is None else device_limit if limit is None: return 0 - unspilled = sum( - buf.size for buf in self.buffers() if not buf.is_spilled - ) + unspilled = sum(buf.size for buf in self.buffers() if not buf.is_spilled) return self.spill_device_memory(nbytes=unspilled - limit) def __repr__(self) -> str: spilled = sum(buf.size for buf in self.buffers() if buf.is_spilled) - unspilled = sum( - buf.size for buf in self.buffers() if not buf.is_spilled - ) + unspilled = sum(buf.size for buf in self.buffers() if not buf.is_spilled) unspillable = 0 for buf in self.buffers(): if not (buf.is_spilled or buf.spillable): diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b25af13679c..35c5a3e1abe 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -111,8 +111,7 @@ def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: manager = get_global_manager() if manager is None: raise ValueError( - f"cannot create {self.__class__} without " - "a global spill manager" + f"cannot create {self.__class__} without " "a global spill manager" ) self._manager = manager @@ -198,9 +197,7 @@ def spill(self, target: str = "cpu") -> None: return if not self.spillable: - raise ValueError( - f"Cannot in-place move an unspillable buffer: {self}" - ) + raise ValueError(f"Cannot in-place move an unspillable buffer: {self}") if (ptr_type, target) == ("gpu", "cpu"): with annotate( @@ -209,9 +206,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( - self._ptr, host_mem - ) + rmm._lib.device_buffer.copy_ptr_to_host(self._ptr, host_mem) self._ptr_desc["memoryview"] = host_mem self._ptr = 0 self._owner = None @@ -343,9 +338,7 @@ def __cuda_array_interface__(self) -> dict: "version": 0, } - def memoryview( - self, *, offset: int = 0, size: Optional[int] = None - ) -> memoryview: + def memoryview(self, *, offset: int = 0, size: Optional[int] = None) -> memoryview: size = self._size if size is None else size with self.lock: if self.spillable: @@ -354,9 +347,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( - self._ptr + offset, ret - ) + rmm._lib.device_buffer.copy_ptr_to_host(self._ptr + offset, ret) return ret def __str__(self) -> str: diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index c2ec7effd13..0d4dee7255f 100644 --- 
a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -106,9 +106,7 @@ def as_buffer( # the Buffer (and its sub-classes) do not have to. if isinstance(data, int): if size is None: - raise ValueError( - "size must be specified when `data` is an integer" - ) + raise ValueError("size must be specified when `data` is an integer") data = cuda_array_interface_wrapper(ptr=data, size=size, owner=owner) elif size is not None or owner is not None: raise ValueError( @@ -151,8 +149,7 @@ def as_buffer( and get_spill_lock() is None ): raise ValueError( - "An owning spillable buffer must " - "either be exposed or spill locked." + "An owning spillable buffer must " "either be exposed or spill locked." ) ptr, size = get_ptr_and_size(data.__cuda_array_interface__) base_ptr = owner.get_ptr(mode="read") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 88bb4521a5b..f245cb90d4a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -100,9 +100,7 @@ class CategoricalAccessor(ColumnMethods): def __init__(self, parent: SeriesOrSingleColumnIndex): if not isinstance(parent.dtype, CategoricalDtype): - raise AttributeError( - "Can only use .cat accessor with a 'category' dtype" - ) + raise AttributeError("Can only use .cat accessor with a 'category' dtype") super().__init__(parent=parent) @property @@ -117,11 +115,7 @@ def codes(self) -> "cudf.Series": """ Return Series of codes as well as the index. """ - index = ( - self._parent.index - if isinstance(self._parent, cudf.Series) - else None - ) + index = self._parent.index if isinstance(self._parent, cudf.Series) else None return cudf.Series(self._column.codes, index=index) @property @@ -267,9 +261,7 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: f"type-cast new_categories to the same type as " f"existing categories." 
) - common_dtype = find_common_type( - [old_categories.dtype, new_categories.dtype] - ) + common_dtype = find_common_type([old_categories.dtype, new_categories.dtype]) new_categories = new_categories.astype(common_dtype) old_categories = old_categories.astype(common_dtype) @@ -557,9 +549,7 @@ def __init__( @property def base_size(self) -> int: - return int( - (self.base_children[0].size) / self.base_children[0].dtype.itemsize - ) + return int((self.base_children[0].size) / self.base_children[0].dtype.itemsize) def __contains__(self, item: ScalarLike) -> bool: try: @@ -617,9 +607,7 @@ def categories(self) -> ColumnBase: @categories.setter def categories(self, value): - self._dtype = CategoricalDtype( - categories=value, ordered=self.dtype.ordered - ) + self._dtype = CategoricalDtype(categories=value, ordered=self.dtype.ordered) @property def codes(self) -> NumericalColumn: @@ -636,9 +624,9 @@ def ordered(self, value: bool): self.dtype.ordered = value def __setitem__(self, key, value): - if cudf.api.types.is_scalar( + if cudf.api.types.is_scalar(value) and cudf._lib.scalar._is_null_host_scalar( value - ) and cudf._lib.scalar._is_null_host_scalar(value): + ): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): @@ -683,26 +671,20 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = cudf._lib.scalar.as_device_scalar( - fill_code, self.codes.dtype - ) + fill_scalar = cudf._lib.scalar.as_device_scalar(fill_code, self.codes.dtype) result = self if inplace else self.copy() libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: Optional[int] = None) -> Self: codes = self.codes.slice(start, stop, stride) return cast( Self, cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.build_column( - codes.base_data, dtype=codes.dtype - ), + codes=cudf.core.column.build_column(codes.base_data, dtype=codes.dtype), mask=codes.base_mask, ordered=self.ordered, size=codes.size, @@ -729,9 +711,7 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if not isinstance(other, CategoricalColumn): return NotImplemented if other.dtype != self.dtype: - raise TypeError( - "Categoricals can only compare with the same type" - ) + raise TypeError("Categoricals can only compare with the same type") return other ary = column.as_column( @@ -817,9 +797,7 @@ def to_arrow(self) -> pa.Array: """Convert to PyArrow Array.""" # arrow doesn't support unsigned codes signed_type = ( - min_signed_type(self.codes.max()) - if self.codes.size > 0 - else np.int8 + min_signed_type(self.codes.max()) if self.codes.size > 0 else np.int8 ) codes = self.codes.astype(signed_type) categories = self.categories @@ -848,13 +826,9 @@ def values(self): raise NotImplementedError("cudf.Categorical is not yet implemented") def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": - return ( - self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) - ) + return self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) - def data_array_view( - self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + def data_array_view(self, *, mode="write") -> cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: @@ -898,9 +872,7 @@ def find_and_replace( f"got to_replace dtype: 
{to_replace_col.dtype} and " f"value dtype: {replacement_col.dtype}" ) - df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} - ) + df = cudf.DataFrame._from_data({"old": to_replace_col, "new": replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: fill_value = ( @@ -917,9 +889,7 @@ def find_and_replace( if fill_value in self.categories: # type: ignore replaced = self.fillna(fill_value) else: - new_categories = self.categories.append( - column.as_column([fill_value]) - ) + new_categories = self.categories.append(column.as_column([fill_value])) replaced = self._set_categories(new_categories) replaced = replaced.fillna(fill_value) df = df.dropna(subset=["old"]) @@ -928,9 +898,7 @@ def find_and_replace( else: replaced = self if df._data["new"].null_count > 0: - drop_values = df._data["old"].apply_boolean_mask( - df._data["new"].isnull() - ) + drop_values = df._data["old"].apply_boolean_mask(df._data["new"].isnull()) cur_categories = replaced.categories new_categories = cur_categories.apply_boolean_mask( ~cudf.Series(cur_categories.isin(drop_values)) @@ -960,9 +928,7 @@ def find_and_replace( # map it to the new label it is to be replaced by dtype_replace = cudf.Series._from_data({None: replacement_col}) dtype_replace[dtype_replace.isin(cats_col)] = None - new_cats_col = cats_col.find_and_replace( - to_replace_col, dtype_replace._column - ) + new_cats_col = cats_col.find_and_replace(to_replace_col, dtype_replace._column) # anything we mapped to None, we want to now filter out since # those categories don't exist anymore @@ -987,15 +953,11 @@ def find_and_replace( # The index of this frame is now the old ints, but the column # named 'index', which came from the filtered categories, # contains the new ints that we need to map to - to_replace_col = column.as_column(catmap.index).astype( - replaced.codes.dtype - ) + to_replace_col = column.as_column(catmap.index).astype(replaced.codes.dtype) replacement_col = catmap._data["index"].astype(replaced.codes.dtype) replaced = column.as_column(replaced.codes) - output = libcudf.replace.replace( - replaced, to_replace_col, replacement_col - ) + output = libcudf.replace.replace(replaced, to_replace_col, replacement_col) result = column.build_categorical_column( categories=new_cats["cats"], @@ -1086,15 +1048,11 @@ def fillna( self.categories, is_unique=True, ) - fill_value = column.as_column(fill_value.codes).astype( - self.codes.dtype - ) + fill_value = column.as_column(fill_value.codes).astype(self.codes.dtype) return super().fillna(fill_value, method=method) - def indices_of( - self, value: ScalarLike - ) -> cudf.core.column.NumericalColumn: + def indices_of(self, value: ScalarLike) -> cudf.core.column.NumericalColumn: return self.codes.indices_of(self._encode(value)) @property @@ -1109,18 +1067,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self if ( - isinstance( - dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) - ) + isinstance(dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype)) and (dtype.categories is None) and (dtype.ordered is None) ): return self if isinstance(dtype, pd.CategoricalDtype): - dtype = CategoricalDtype( - categories=dtype.categories, ordered=dtype.ordered - ) + dtype = CategoricalDtype(categories=dtype.categories, ordered=dtype.ordered) if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be 
CategoricalDtype") @@ -1137,26 +1091,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column( - self, dtype, format: str | None = None - ) -> StringColumn: - return self._get_decategorized_column().as_string_column( - dtype, format=format - ) + def as_string_column(self, dtype, format: str | None = None) -> StringColumn: + return self._get_decategorized_column().as_string_column(dtype, format=format) - def as_datetime_column( - self, dtype, format: str | None = None - ) -> DatetimeColumn: - return self._get_decategorized_column().as_datetime_column( - dtype, format - ) + def as_datetime_column(self, dtype, format: str | None = None) -> DatetimeColumn: + return self._get_decategorized_column().as_datetime_column(dtype, format) - def as_timedelta_column( - self, dtype, format: str | None = None - ) -> TimeDeltaColumn: - return self._get_decategorized_column().as_timedelta_column( - dtype, format - ) + def as_timedelta_column(self, dtype, format: str | None = None) -> TimeDeltaColumn: + return self._get_decategorized_column().as_timedelta_column(dtype, format) def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): @@ -1170,9 +1112,7 @@ def _get_decategorized_column(self) -> ColumnBase: def copy(self, deep: bool = True) -> Self: result_col = super().copy(deep=deep) if deep: - result_col.categories = libcudf.copying.copy_column( - self.dtype._categories - ) + result_col.categories = libcudf.copying.copy_column(self.dtype._categories) return result_col @cached_property @@ -1202,9 +1142,7 @@ def _concat( # improved as the concatenation API is solidified. # Find the first non-null column: - head = next( - (obj for obj in objs if obj.null_count != len(obj)), objs[0] - ) + head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0]) # Combine and de-dupe the categories cats = column.concat_columns([o.categories for o in objs]).unique() @@ -1214,8 +1152,7 @@ def _concat( newsize = sum(map(len, codes)) if newsize > libcudf.MAX_COLUMN_SIZE: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"Result of concat cannot have " f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype, masked=True) @@ -1226,23 +1163,17 @@ def _concat( return column.build_categorical_column( categories=column.as_column(cats), - codes=column.build_column( - codes_col.base_data, dtype=codes_col.dtype - ), + codes=column.build_column(codes_col.base_data, dtype=codes_col.dtype), mask=codes_col.base_mask, size=codes_col.size, offset=codes_col.offset, ) - def _with_type_metadata( - self: CategoricalColumn, dtype: Dtype - ) -> CategoricalColumn: + def _with_type_metadata(self: CategoricalColumn, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=column.build_column( - self.codes.base_data, dtype=self.codes.dtype - ), + codes=column.build_column(self.codes.base_data, dtype=self.codes.dtype), mask=self.codes.base_mask, ordered=dtype.ordered, size=self.codes.size, @@ -1291,9 +1222,7 @@ def set_categories( # return a column full of Nulls. 
out_col = _create_empty_categorical_column( self, - CategoricalDtype( - categories=new_categories, ordered=ordered - ), + CategoricalDtype(categories=new_categories, ordered=ordered), ) elif ( not out_col._categories_equal(new_categories, ordered=True) @@ -1305,9 +1234,7 @@ def set_categories( ) return out_col - def _categories_equal( - self, new_categories: ColumnBase, ordered=False - ) -> bool: + def _categories_equal(self, new_categories: ColumnBase, ordered=False) -> bool: cur_categories = self.categories if len(new_categories) != len(cur_categories): return False @@ -1315,12 +1242,8 @@ def _categories_equal( return False # if order doesn't matter, sort before the equals call below if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values( - ignore_index=True - ) - new_categories = cudf.Series(new_categories).sort_values( - ignore_index=True - ) + cur_categories = cudf.Series(cur_categories).sort_values(ignore_index=True) + new_categories = cudf.Series(new_categories).sort_values(ignore_index=True) return cur_categories.equals(new_categories) def _set_categories( @@ -1349,18 +1272,12 @@ def _set_categories( new_cats = cudf.Series(new_cats)._column.unique() cur_codes = self.codes - max_cat_size = ( - len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) - ) + max_cat_size = len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) out_code_dtype = min_unsigned_type(max_cat_size) cur_order = column.as_column(range(len(cur_codes))) - old_codes = column.as_column( - range(len(cur_cats)), dtype=out_code_dtype - ) - new_codes = column.as_column( - range(len(new_cats)), dtype=out_code_dtype - ) + old_codes = column.as_column(range(len(cur_cats)), dtype=out_code_dtype) + new_codes = column.as_column(range(len(new_cats)), dtype=out_code_dtype) new_df = cudf.DataFrame._from_data( data={"new_codes": new_codes, "cats": new_cats} @@ -1385,9 +1302,7 @@ def _set_categories( # codes can't have masks, so take mask out before moving in return column.build_categorical_column( categories=new_cats, - codes=column.build_column( - new_codes.base_data, dtype=new_codes.dtype - ), + codes=column.build_column(new_codes.base_data, dtype=new_codes.dtype), mask=new_codes.base_mask, size=new_codes.size, offset=new_codes.offset, @@ -1406,8 +1321,7 @@ def reorder_categories( # current set of categories. if not self._categories_equal(new_categories, ordered=False): raise ValueError( - "items in new_categories are not the same as in " - "old categories" + "items in new_categories are not the same as in " "old categories" ) return self._set_categories(new_categories, ordered=ordered) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3e0ec4b5cd7..6a05293e500 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -204,16 +204,12 @@ def to_pandas( # This default implementation does not handle nulls in any meaningful # way if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + return pd.Series(pd.arrays.ArrowExtensionArray(pa_array), index=index) else: pd_series = pa_array.to_pandas() @@ -310,9 +306,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ - "None" - ].chunk(0) + return libcudf.interop.to_arrow([self], [("None", self.dtype)])["None"].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -339,10 +333,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: data = pa.table([array], [None]) - if ( - isinstance(array.type, pa.TimestampType) - and array.type.tz is not None - ): + if isinstance(array.type, pa.TimestampType) and array.type.tz is not None: raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) @@ -470,9 +461,7 @@ def copy(self, deep: bool = True) -> Self: else self.base_mask.copy(deep=False), size=self.size, offset=self.offset, - children=tuple( - col.copy(deep=False) for col in self.base_children - ), + children=tuple(col.copy(deep=False) for col in self.base_children), ), ) @@ -493,9 +482,7 @@ def view(self, dtype: Dtype) -> ColumnBase: dtype = cudf.dtype(dtype) if dtype.kind in ("o", "u", "s"): - raise TypeError( - "Bytes viewed as str without metadata is ambiguous" - ) + raise TypeError("Bytes viewed as str without metadata is ambiguous") if self.dtype.itemsize == dtype.itemsize: return build_column( @@ -508,9 +495,7 @@ def view(self, dtype: Dtype) -> ColumnBase: else: if self.null_count > 0: - raise ValueError( - "Can not produce a view of a column with nulls" - ) + raise ValueError("Can not produce a view of a column with nulls") if (self.size * self.dtype.itemsize) % dtype.itemsize: raise ValueError( @@ -545,9 +530,7 @@ def element_indexing(self, index: int): return pd.Timedelta(result) return result - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: Optional[int] = None) -> Self: stride = 1 if stride is None else stride if start < 0: start = start + len(self) @@ -647,9 +630,7 @@ def _scatter_by_column( if is_bool_dtype(key.dtype): # `key` is boolean mask if len(key) != len(self): - raise ValueError( - "Boolean mask must be of same length as column" - ) + raise ValueError("Boolean mask must be of same length as column") if isinstance(value, ColumnBase) and len(self) == len(value): # Both value and key are aligned to self. 
Thus, the values # corresponding to the false values in key should be @@ -674,9 +655,9 @@ def _scatter_by_column( 0 ]._with_type_metadata(self.dtype) else: - return libcudf.copying.scatter([value], key, [self])[ - 0 - ]._with_type_metadata(self.dtype) + return libcudf.copying.scatter([value], key, [self])[0]._with_type_metadata( + self.dtype + ) def _check_scatter_key_length( self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase] @@ -728,9 +709,7 @@ def notnull(self) -> ColumnBase: return result - def indices_of( - self, value: ScalarLike | Self - ) -> cudf.core.column.NumericalColumn: + def indices_of(self, value: ScalarLike | Self) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -935,15 +914,11 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls() and libcudf.sort.is_sorted( - [self], [True], None - ) + return not self.has_nulls() and libcudf.sort.is_sorted([self], [True], None) @property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls() and libcudf.sort.is_sorted( - [self], [False], None - ) + return not self.has_nulls() and libcudf.sort.is_sorted([self], [False], None) def sort_values( self: ColumnBase, @@ -958,9 +933,7 @@ def distinct_count(self, dropna: bool = True) -> int: try: return self._distinct_count[dropna] except KeyError: - self._distinct_count[dropna] = cpp_distinct_count( - self, ignore_nulls=dropna - ) + self._distinct_count[dropna] = cpp_distinct_count(self, ignore_nulls=dropna) return self._distinct_count[dropna] def can_cast_safely(self, to_dtype: Dtype) -> bool: @@ -1020,12 +993,8 @@ def as_categorical_column(self, dtype) -> ColumnBase: # Re-label self w.r.t. the provided categories if ( - isinstance(dtype, cudf.CategoricalDtype) - and dtype._categories is not None - ) or ( - isinstance(dtype, pd.CategoricalDtype) - and dtype.categories is not None - ): + isinstance(dtype, cudf.CategoricalDtype) and dtype._categories is not None + ) or (isinstance(dtype, pd.CategoricalDtype) and dtype.categories is not None): labels = self._label_encoding(cats=as_column(dtype.categories)) return build_categorical_column( @@ -1055,9 +1024,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ordered=ordered, ) - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": raise NotImplementedError def as_datetime_column( @@ -1065,9 +1032,7 @@ def as_datetime_column( ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError - def as_interval_column( - self, dtype: Dtype - ) -> "cudf.core.column.IntervalColumn": + def as_interval_column(self, dtype: Dtype) -> "cudf.core.column.IntervalColumn": raise NotImplementedError def as_timedelta_column( @@ -1090,16 +1055,12 @@ def apply_boolean_mask(self, mask) -> ColumnBase: if not is_bool_dtype(mask.dtype): raise ValueError("boolean_mask is not boolean type.") - return apply_boolean_mask([self], mask)[0]._with_type_metadata( - self.dtype - ) + return apply_boolean_mask([self], mask)[0]._with_type_metadata(self.dtype) def argsort( self, ascending: bool = True, na_position: str = "last" ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + return libcudf.sort.order_by([self], [ascending], na_position, stable=True) def __arrow_array__(self, type=None): raise TypeError( @@ -1111,8 +1072,7 @@ def __arrow_array__(self, 
type=None): @property def __cuda_array_interface__(self): raise NotImplementedError( - f"dtype {self.dtype} is not yet supported via " - "`__cuda_array_interface__`" + f"dtype {self.dtype} is not yet supported via " "`__cuda_array_interface__`" ) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -1141,9 +1101,7 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata( - self.dtype - ) + return drop_duplicates([self], keep="first")[0]._with_type_metadata(self.dtype) def serialize(self) -> Tuple[dict, list]: # data model: @@ -1180,9 +1138,7 @@ def serialize(self) -> Tuple[dict, list]: header["mask"] = mask_header frames.extend(mask_frames) if self.children: - child_headers, child_frames = zip( - *(c.serialize() for c in self.children) - ) + child_headers, child_frames = zip(*(c.serialize() for c in self.children)) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) header["size"] = self.size @@ -1228,13 +1184,9 @@ def unpack(header, frames) -> Tuple[Any, list]: ) def unary_operator(self, unaryop: str): - raise TypeError( - f"Operation {unaryop} not supported for dtype {self.dtype}." - ) + raise TypeError(f"Operation {unaryop} not supported for dtype {self.dtype}.") - def normalize_binop_value( - self, other: ScalarLike - ) -> Union[ColumnBase, ScalarLike]: + def normalize_binop_value(self, other: ScalarLike) -> Union[ColumnBase, ScalarLike]: raise NotImplementedError def _reduce( @@ -1253,9 +1205,7 @@ def _reduce( The minimum number of entries for the reduction, otherwise the reduction returns NaN. """ - preprocessed = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) + preprocessed = self._process_for_reduction(skipna=skipna, min_count=min_count) if isinstance(preprocessed, ColumnBase): return libcudf.reduce.reduce(op, preprocessed, **kwargs) return preprocessed @@ -1357,9 +1307,7 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - left_gather_map, right_gather_map = cpp_join( - [self], [cats], how="left" - ) + left_gather_map, right_gather_map = cpp_join([self], [cats], how="left") codes = libcudf.copying.gather( [as_column(range(len(cats)), dtype=dtype)], right_gather_map, @@ -1391,9 +1339,7 @@ def column_empty_like( and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like( - catcolumn.codes, masked=masked, newsize=newsize - ) + codes = column_empty_like(catcolumn.codes, masked=masked, newsize=newsize) return build_column( data=None, dtype=dtype, @@ -1405,9 +1351,7 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def column_empty_like_same_mask( - column: ColumnBase, dtype: Dtype -) -> ColumnBase: +def column_empty_like_same_mask(column: ColumnBase, dtype: Dtype) -> ColumnBase: """Create a new empty Column with the same length and the same mask. 
Parameters @@ -1437,9 +1381,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=libcudf.types.size_type_dtype), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1458,9 +1400,7 @@ def column_empty( elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=libcudf.types.size_type_dtype), ) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -1470,9 +1410,7 @@ def column_empty( else: mask = None - return build_column( - data, dtype, mask=mask, size=row_count, children=children - ) + return build_column(data, dtype, mask=mask, size=row_count, children=children) def build_column( @@ -1764,9 +1702,7 @@ def as_column( as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), ) if cudf.get_option("default_integer_bitwidth") and dtype is None: - dtype = cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) + dtype = cudf.dtype(f'i{cudf.get_option("default_integer_bitwidth")//8}') if dtype is not None: return column.astype(dtype) return column @@ -1844,10 +1780,7 @@ def as_column( "yet supported in pyarrow, see: " "https://github.com/apache/arrow/issues/20213" ) - elif ( - pa.types.is_timestamp(arbitrary.type) - and arbitrary.type.tz is not None - ): + elif pa.types.is_timestamp(arbitrary.type) and arbitrary.type.tz is not None: raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) @@ -1874,9 +1807,9 @@ def as_column( # be of `object` dtype. new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) - if cudf.get_option( - "mode.pandas_compatible" - ) and new_dtype == cudf.dtype("O"): + if cudf.get_option("mode.pandas_compatible") and new_dtype == cudf.dtype( + "O" + ): # We internally raise if we do `astype("object")`, hence # need to cast to `str` since this is safe to do so because # it is a null-array. 
@@ -1888,9 +1821,7 @@ def as_column( return col - elif isinstance( - arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray) - ): + elif isinstance(arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray)): if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)): raise NotImplementedError( f"cuDF does not yet support {type(arbitrary.dtype).__name__}" @@ -1909,9 +1840,7 @@ def as_column( ) or ( isinstance(arbitrary.dtype, pd.CategoricalDtype) - and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ) + and isinstance(arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype) ) ): raise NotImplementedError( @@ -1930,9 +1859,7 @@ def as_column( dtype=dtype, length=length, ) - elif isinstance( - arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype) - ): + elif isinstance(arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)): data = as_column( pa.array(arbitrary, from_pandas=True), nan_as_null=nan_as_null, @@ -1942,9 +1869,7 @@ def as_column( elif isinstance( arbitrary.dtype, pd.api.extensions.ExtensionDtype ) and not isinstance(arbitrary, NumpyExtensionArray): - raise NotImplementedError( - "Custom pandas ExtensionDtypes are not supported" - ) + raise NotImplementedError("Custom pandas ExtensionDtypes are not supported") elif arbitrary.dtype.kind in "fiubmM": # numpy dtype like if isinstance(arbitrary, NumpyExtensionArray): @@ -1974,12 +1899,9 @@ def as_column( "empty", "boolean", ): - raise TypeError( - f"Cannot convert a {inferred_dtype} of object type" - ) + raise TypeError(f"Cannot convert a {inferred_dtype} of object type") elif nan_as_null is False and ( - pd.isna(arbitrary).any() - and inferred_dtype not in ("decimal", "empty") + pd.isna(arbitrary).any() and inferred_dtype not in ("decimal", "empty") ): # Decimal can hold float("nan") # All np.nan is not restricted by type @@ -2042,11 +1964,7 @@ def as_column( arbitrary = np.asarray(arbitrary) # Handle case that `arbitrary` elements are cupy arrays - if ( - shape - and shape[0] - and hasattr(arbitrary[0], "__cuda_array_interface__") - ): + if shape and shape[0] and hasattr(arbitrary[0], "__cuda_array_interface__"): return as_column( cupy.asarray(arbitrary, dtype=arbitrary[0].dtype), nan_as_null=nan_as_null, @@ -2143,9 +2061,7 @@ def as_column( data = data.astype(cudf.dtype(dtype)) elif (view := as_memoryview(arbitrary)) is not None: - return as_column( - np.asarray(view), dtype=dtype, nan_as_null=nan_as_null - ) + return as_column(np.asarray(view), dtype=dtype, nan_as_null=nan_as_null) # Start of arbitrary that's not handed above but dtype provided elif isinstance(dtype, pd.DatetimeTZDtype): raise NotImplementedError( @@ -2205,8 +2121,7 @@ def as_column( and pa_array.type.tz is not None ): raise NotImplementedError( - "cuDF does not yet support timezone-aware " - "datetimes" + "cuDF does not yet support timezone-aware " "datetimes" ) if is_bool_dtype(dtype): # Need this special case handling for bool dtypes, @@ -2217,9 +2132,7 @@ def as_column( if np_dtype.kind in {"m", "M"}: unit = np.datetime_data(np_dtype)[0] if unit not in {"ns", "us", "ms", "s", "D"}: - raise NotImplementedError( - f"{dtype=} is not supported." - ) + raise NotImplementedError(f"{dtype=} is not supported.") pa_type = np_to_pa_dtype(np_dtype) else: # By default cudf constructs a 64-bit column. 
Setting @@ -2229,18 +2142,14 @@ def as_column( cudf.get_option("default_integer_bitwidth") and infer_dtype(arbitrary) == "integer" ): - pa_type = np_to_pa_dtype( - _maybe_convert_to_default_type("int") - ) + pa_type = np_to_pa_dtype(_maybe_convert_to_default_type("int")) if cudf.get_option("default_float_bitwidth") and infer_dtype( arbitrary ) in ( "floating", "mixed-integer-float", ): - pa_type = np_to_pa_dtype( - _maybe_convert_to_default_type("float") - ) + pa_type = np_to_pa_dtype(_maybe_convert_to_default_type("float")) pyarrow_array = pa.array( arbitrary, @@ -2280,9 +2189,7 @@ def as_column( elif ( isinstance(arbitrary, Sequence) and len(arbitrary) > 0 - and any( - cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary - ) + and any(cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary) ): # TODO: I think can be removed; covered by # elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)): @@ -2326,9 +2233,7 @@ def _construct_array( if inferred_dtype == "interval": # Only way to construct an Interval column. return pd.array(arbitrary) - elif ( - inferred_dtype == "string" and getattr(dtype, "kind", None) == "M" - ): + elif inferred_dtype == "string" and getattr(dtype, "kind", None) == "M": # We may have date-like strings with timezones try: with warnings.catch_warnings(): @@ -2347,9 +2252,7 @@ def _construct_array( arbitrary = np.asarray( arbitrary, - dtype=native_dtype - if native_dtype is None - else np.dtype(native_dtype), + dtype=native_dtype if native_dtype is None else np.dtype(native_dtype), ) return arbitrary @@ -2371,9 +2274,7 @@ def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: col = as_column(mask) mask = bools_to_mask(col) else: - raise NotImplementedError( - f"Cannot infer mask from typestr {typestr}" - ) + raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") return mask @@ -2433,8 +2334,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and np.issubdtype(dtyp, np.datetime64) + _is_non_decimal_numeric_dtype(dtyp) and np.issubdtype(dtyp, np.datetime64) for dtyp in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) @@ -2462,9 +2362,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: if all(isinstance(o.dtype, CategoricalDtype) for o in objs): return cudf.core.column.categorical.CategoricalColumn._concat( cast( - MutableSequence[ - cudf.core.column.categorical.CategoricalColumn - ], + MutableSequence[cudf.core.column.categorical.CategoricalColumn], objs, ) ) @@ -2472,8 +2370,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: newsize = sum(map(len, objs)) if newsize > libcudf.MAX_COLUMN_SIZE: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"Result of concat cannot have " f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: return column_empty(0, head.dtype, masked=True) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 9a5d9dcd47a..ccf1e248c1b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -175,12 +175,9 @@ def _get_datetime_format(col, dtype, time_unit): sub_second_res_len = 0 has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any() - has_micros = ( - 
time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() - ) + has_micros = time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() has_millis = ( - time_unit in {"ns", "us", "ms"} - and col.get_dt_field("millisecond").any() + time_unit in {"ns", "us", "ms"} and col.get_dt_field("millisecond").any() ) has_seconds = col.get_dt_field("second").any() has_minutes = col.get_dt_field("minute").any() @@ -269,9 +266,7 @@ def __contains__(self, item: ScalarLike) -> bool: # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item_as_dt64.astype("int64") in self.as_numerical_column( - "int64" - ) + return item_as_dt64.astype("int64") in self.as_numerical_column("int64") @property def time_unit(self) -> str: @@ -318,9 +313,7 @@ def values(self): """ Return a CuPy representation of the DateTimeColumn. """ - raise NotImplementedError( - "DateTime Arrays is not yet implemented in cudf" - ) + raise NotImplementedError("DateTime Arrays is not yet implemented in cudf") def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) @@ -361,9 +354,7 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if other_time_unit not in {"s", "ms", "ns", "us"}: other_time_unit = "ns" - return cudf.Scalar( - None, dtype=f"datetime64[{other_time_unit}]" - ) + return cudf.Scalar(None, dtype=f"datetime64[{other_time_unit}]") other = other.astype(self.dtype) return cudf.Scalar(other) @@ -424,13 +415,9 @@ def as_datetime_column( def as_timedelta_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": - raise TypeError( - f"cannot astype a datetimelike from {self.dtype} to {dtype}" - ) + raise TypeError(f"cannot astype a datetimelike from {self.dtype} to {dtype}") - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, dtype=np.int64, @@ -454,22 +441,18 @@ def as_string_column( if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: - names = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + names = cudf.core.column.column_empty(0, dtype="object", masked=False) if len(self) > 0: - return string._datetime_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format, names) + return string._datetime_to_str_typecast_functions[cudf.dtype(self.dtype)]( + self, format, names + ) else: return cast( "cudf.core.column.StringColumn", column.column_empty(0, dtype="object", masked=False), ) - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + def mean(self, skipna=None, min_count: int = 0, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical_column("int64").mean( skipna=skipna, min_count=min_count, dtype=dtype @@ -502,9 +485,7 @@ def cov(self, other: DatetimeColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return self.as_numerical_column("int64").cov(other.as_numerical_column("int64")) def corr(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): @@ -529,9 +510,7 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timestamp(result, unit=self.time_unit).as_unit( - self.time_unit - ) + return 
pd.Timestamp(result, unit=self.time_unit).as_unit(self.time_unit) return result.astype(self.dtype) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: @@ -583,9 +562,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: "__ne__", }: out_dtype = cudf.dtype(np.bool_) - if isinstance(other, ColumnBase) and not isinstance( - other, DatetimeColumn - ): + if isinstance(other, ColumnBase) and not isinstance(other, DatetimeColumn): result = _all_bools_with_nulls( self, other, bool_fill_value=op == "__ne__" ) @@ -599,9 +576,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) if out_dtype != cudf.dtype(np.bool_) and op == "__add__": return result_col - elif cudf.get_option( - "mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): + elif cudf.get_option("mode.pandas_compatible") and out_dtype == cudf.dtype( + np.bool_ + ): return result_col.fillna(op == "__ne__") else: return result_col @@ -622,9 +599,7 @@ def fillna( return super().fillna(fill_value, method) - def indices_of( - self, value: ScalarLike - ) -> cudf.core.column.NumericalColumn: + def indices_of(self, value: ScalarLike) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype ).as_numerical_column("int64") @@ -655,9 +630,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if max_dist <= np.timedelta64(max_int, to_res).astype( self_delta_dtype - ) and min_dist <= np.timedelta64(max_int, to_res).astype( - self_delta_dtype - ): + ) and min_dist <= np.timedelta64(max_int, to_res).astype(self_delta_dtype): return True else: return False @@ -708,9 +681,7 @@ def to_pandas( arrow_type: bool = False, ) -> pd.Series: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") elif arrow_type: @@ -755,9 +726,7 @@ def as_string_column( return self._local_time.as_string_column(dtype, format) def get_dt_field(self, field: str) -> ColumnBase: - return libcudf.datetime.extract_datetime_component( - self._local_time, field - ) + return libcudf.datetime.extract_datetime_component(self._local_time, field) def __repr__(self): # Arrow prints the UTC timestamps, but we want to print the @@ -766,7 +735,5 @@ def __repr__(self): pa.timestamp(self.dtype.unit, str(self.dtype.tz)) ) return ( - f"{object.__repr__(self)}\n" - f"{arr.to_string()}\n" - f"dtype: {self.dtype}" + f"{object.__repr__(self)}\n" f"{arr.to_string()}\n" f"dtype: {self.dtype}" ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index b83a6ded416..7c6ffca866c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -69,9 +69,7 @@ def as_string_column( def __pow__(self, other): if isinstance(other, int): if other == 0: - res = cudf.core.column.as_column( - 1, dtype=self.dtype, length=len(self) - ) + res = cudf.core.column.as_column(1, dtype=self.dtype, length=len(self)) if self.nullable: res = res.set_mask(self.mask) return res @@ -185,9 +183,7 @@ def normalize_binop_value(self, other): metadata = other.as_tuple() precision = max(len(metadata.digits), metadata.exponent) scale = -metadata.exponent - return cudf.Scalar( - other, dtype=self.dtype.__class__(precision, scale) - ) + return cudf.Scalar(other, dtype=self.dtype.__class__(precision, scale)) return NotImplemented def _decimal_quantile( @@ -195,17 +191,13 @@ def _decimal_quantile( ) -> ColumnBase: quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( - [self], [True], "first", stable=True - ).slice(self.null_count, len(self)) - result = libcudf.quantiles.quantile( - self, quant, interpolation, indices, exact + indices = libcudf.sort.order_by([self], [True], "first", stable=True).slice( + self.null_count, len(self) ) + result = libcudf.quantiles.quantile(self, quant, interpolation, indices, exact) return result._with_type_metadata(self.dtype) - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": return libcudf.unary.cast(self, dtype) @@ -239,15 +231,9 @@ def to_arrow(self): data_buf_128[::4] = data_buf_32 # use striding again to set the remaining bits of each 128-bit chunk: # 0 for non-negative values, -1 for negative values: - data_buf_128[1::4] = np.piecewise( - data_buf_32, [data_buf_32 < 0], [-1, 0] - ) - data_buf_128[2::4] = np.piecewise( - data_buf_32, [data_buf_32 < 0], [-1, 0] - ) - data_buf_128[3::4] = np.piecewise( - data_buf_32, [data_buf_32 < 0], [-1, 0] - ) + data_buf_128[1::4] = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0]) + data_buf_128[2::4] = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0]) + data_buf_128[3::4] = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0]) data_buf = pa.py_buffer(data_buf_128) mask_buf = ( self.base_mask @@ -326,9 +312,7 @@ def to_arrow(self): data_buf_128[::2] = data_buf_64 # use striding again to set the remaining bits of each 128-bit chunk: # 0 for non-negative values, -1 for negative values: - data_buf_128[1::2] = np.piecewise( - data_buf_64, 
[data_buf_64 < 0], [-1, 0] - ) + data_buf_128[1::2] = np.piecewise(data_buf_64, [data_buf_64 < 0], [-1, 0]) data_buf = pa.py_buffer(data_buf_128) mask_buf = ( self.base_mask diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 7bd693966dc..3786b8cf8fa 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -65,9 +65,7 @@ def from_struct_column(cls, struct_column: StructColumn, closed="right"): first_field_name = next(iter(struct_column.dtype.fields.keys())) return IntervalColumn( size=struct_column.size, - dtype=IntervalDtype( - struct_column.dtype.fields[first_field_name], closed - ), + dtype=IntervalDtype(struct_column.dtype.fields[first_field_name], closed), mask=struct_column.base_mask, offset=struct_column.offset, null_count=struct_column.null_count, @@ -78,9 +76,7 @@ def copy(self, deep=True): struct_copy = super().copy(deep=deep) return IntervalColumn( size=struct_copy.size, - dtype=IntervalDtype( - struct_copy.dtype.fields["left"], self.dtype.closed - ), + dtype=IntervalDtype(struct_copy.dtype.fields["left"], self.dtype.closed), mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, @@ -119,9 +115,7 @@ def to_pandas( # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") elif arrow_type: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1c2bcbef2ec..64a8f8b617c 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -71,9 +71,9 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[ - 0 - ].element_indexing(current_offset) + current_offset = current_base_child.base_children[0].element_indexing( + current_offset + ) n += child0_size current_base_child = current_base_child.base_children[1] @@ -117,8 +117,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return concatenate_rows([self, other]) else: raise NotImplementedError( - "Lists concatenation for this operation is not yet" - "supported" + "Lists concatenation for this operation is not yet" "supported" ) else: raise TypeError("can only concatenate list to list") @@ -185,9 +184,7 @@ def _with_type_metadata( self: "cudf.core.column.ListColumn", dtype: Dtype ) -> "cudf.core.column.ListColumn": if isinstance(dtype, ListDtype): - elements = self.base_children[1]._with_type_metadata( - dtype.element_type - ) + elements = self.base_children[1]._with_type_metadata(dtype.element_type) return ListColumn( dtype=dtype, mask=self.base_mask, @@ -299,16 +296,12 @@ def to_pandas( # Can't rely on Column.to_pandas implementation for lists. # Need to perform `to_pylist` to preserve list types. if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + return pd.Series(pd.arrays.ArrowExtensionArray(pa_array), index=index) else: return pd.Series(pa_array.tolist(), dtype="object", index=index) @@ -322,9 +315,7 @@ class ListMethods(ColumnMethods): def __init__(self, parent: ParentType): if not isinstance(parent.dtype, ListDtype): - raise AttributeError( - "Can only use .list accessor with a 'list' dtype" - ) + raise AttributeError("Can only use .list accessor with a 'list' dtype") super().__init__(parent=parent) def get( @@ -392,15 +383,11 @@ def get( if not (default is None or default is NA): # determine rows for which `index` is out-of-bounds lengths = count_elements(self._column) - out_of_bounds_mask = (np.negative(index) > lengths) | ( - index >= lengths - ) + out_of_bounds_mask = (np.negative(index) > lengths) | (index >= lengths) # replace the value in those rows (should be NA) with `default` if out_of_bounds_mask.any(): - out = out._scatter_by_column( - out_of_bounds_mask, cudf.Scalar(default) - ) + out = out._scatter_by_column(out_of_bounds_mask, cudf.Scalar(default)) if out.dtype != self._column.dtype.element_type: # libcudf doesn't maintain struct labels so we must transfer over # manually from the input column if we lost some information @@ -510,9 +497,7 @@ def leaves(self) -> ParentType: 5 6 dtype: int64 """ - return self._return_or_inplace( - self._column.leaves(), retain_index=False - ) + return self._return_or_inplace(self._column.leaves(), retain_index=False) def len(self) -> ParentType: """ @@ -570,17 +555,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: if not isinstance(lists_indices_col, ListColumn): raise ValueError("lists_indices should be list type array.") if not lists_indices_col.size == self._column.size: - raise ValueError( - "lists_indices and list column is of different " "size." - ) + raise ValueError("lists_indices and list column is of different " "size.") if not _is_non_decimal_numeric_dtype( lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer - ): - raise TypeError( - "lists_indices should be column of values of index types." 
- ) + ) or not np.issubdtype(lists_indices_col.children[1].dtype, np.integer): + raise TypeError("lists_indices should be column of values of index types.") return self._return_or_inplace( segmented_gather(self._column, lists_indices_col) @@ -745,7 +724,5 @@ def astype(self, dtype): ListDtype(float64) """ return self._return_or_inplace( - self._column._transform_leaves( - lambda col, dtype: col.astype(dtype), dtype - ) + self._column._transform_leaves(lambda col, dtype: col.astype(dtype), dtype) ) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 0f5a0eb086b..422d03677e9 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -67,9 +67,7 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_data( - {self._parent.name: new_col} - ), + self._parent.__class__._from_data({self._parent.name: new_col}), inplace=True, ) return None @@ -97,8 +95,6 @@ def _return_or_inplace( else: return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.core.index.as_index( - new_col, name=self._parent.name - ) + return cudf.core.index.as_index(new_col, name=self._parent.name) else: return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b2bd73c9856..8e2e4cb3d8b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -131,24 +131,16 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: f"Cannot use a {type(value).__name__} to find an index of " f"a {self.dtype} Index." ) - if ( - value is not None - and self.dtype.kind in {"c", "f"} - and np.isnan(value) - ): + if value is not None and self.dtype.kind in {"c", "f"} and np.isnan(value): return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), + cp.argwhere(cp.isnan(self.data_array_view(mode="read"))).flatten(), dtype=size_type_dtype, ) else: return super().indices_of(value) def has_nulls(self, include_nan: bool = False) -> bool: - return bool(self.null_count != 0) or ( - include_nan and bool(self.nan_count != 0) - ) + return bool(self.null_count != 0) or (include_nan and bool(self.nan_count != 0)) def __setitem__(self, key: Any, value: Any): """ @@ -181,9 +173,7 @@ def __setitem__(self, key: Any, value: Any): else: key = as_column( key, - dtype="float64" - if isinstance(key, list) and len(key) == 0 - else None, + dtype="float64" if isinstance(key, list) and len(key) == 0 else None, ) if not isinstance(key, cudf.core.column.NumericalColumn): raise ValueError(f"Invalid scatter map type {key.dtype}.") @@ -355,9 +345,9 @@ def as_string_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self) + return string._numeric_to_str_typecast_functions[cudf.dtype(self.dtype)]( + self + ) else: return cast( cudf.core.column.StringColumn, @@ -392,9 +382,7 @@ def as_timedelta_column( ), ) - def as_decimal_column( - self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": + def as_decimal_column(self, dtype: Dtype) -> "cudf.core.column.DecimalBaseColumn": return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: @@ -500,15 +488,11 @@ def find_and_replace( ): return self.copy() - 
to_replace_col = _normalize_find_and_replace_input( - self.dtype, to_replace - ) + to_replace_col = _normalize_find_and_replace_input(self.dtype, to_replace) if all_nan: replacement_col = column.as_column(replacement, dtype=self.dtype) else: - replacement_col = _normalize_find_and_replace_input( - self.dtype, replacement - ) + replacement_col = _normalize_find_and_replace_input(self.dtype, replacement) if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( replacement[0], length=len(to_replace_col), dtype=self.dtype @@ -518,9 +502,7 @@ def find_and_replace( to_replace_col, replacement_col, replaced = numeric_normalize_types( to_replace_col, replacement_col, self ) - df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} - ) + df = cudf.DataFrame._from_data({"old": to_replace_col, "new": replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: replaced = replaced.fillna( @@ -530,9 +512,7 @@ def find_and_replace( ) df = df.dropna(subset=["old"]) - return libcudf.replace.replace( - replaced, df._data["old"], df._data["new"] - ) + return libcudf.replace.replace(replaced, df._data["old"], df._data["new"]) def fillna( self, @@ -553,10 +533,7 @@ def fillna( if fill_value is None: raise ValueError("Must specify either 'fill_value' or 'method'") - if ( - isinstance(fill_value, cudf.Scalar) - and fill_value.dtype == col.dtype - ): + if isinstance(fill_value, cudf.Scalar) and fill_value.dtype == col.dtype: return super(NumericalColumn, col).fillna(fill_value, method) if np.isscalar(fill_value): @@ -627,9 +604,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: i_max_ = np.iinfo(self.dtype).max u_max_ = np.iinfo(to_dtype).max - return (self.min() >= 0) and ( - (i_max_ <= u_max_) or (self.max() < u_max_) - ) + return (self.min() >= 0) and ((i_max_ <= u_max_) or (self.max() < u_max_)) # want to cast uint to int elif self.dtype.kind == "u" and to_dtype.kind == "i": @@ -642,9 +617,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: elif self.dtype.kind in {"i", "u"} and to_dtype.kind == "f": info = np.finfo(to_dtype) biggest_exact_int = 2 ** (info.nmant + 1) - if (self.min() >= -biggest_exact_int) and ( - self.max() <= biggest_exact_int - ): + if (self.min() >= -biggest_exact_int) and (self.max() <= biggest_exact_int): return True else: filled = self.fillna(0) @@ -691,20 +664,14 @@ def to_pandas( arrow_type: bool = False, ) -> pd.Series: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") elif arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index ) elif ( nullable - and ( - pandas_nullable_dtype := np_dtypes_to_pandas_dtypes.get( - self.dtype - ) - ) + and (pandas_nullable_dtype := np_dtypes_to_pandas_dtypes.get(self.dtype)) is not None ): arrow_array = self.to_arrow() @@ -718,9 +685,7 @@ def to_pandas( def _reduction_result_dtype(self, reduction_op: str) -> Dtype: col_dtype = self.dtype if reduction_op in {"sum", "product"}: - col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + col_dtype = col_dtype if col_dtype.kind == "f" else np.dtype("int64") elif reduction_op == "sum_of_squares": col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) @@ -738,9 +703,7 @@ def _normalize_find_and_replace_input( if isinstance(col_to_normalize, list): if normalized_column.null_count == len(normalized_column): normalized_column = normalized_column.astype(input_column_dtype) - col_to_normalize_dtype = min_column_type( - normalized_column, input_column_dtype - ) + col_to_normalize_dtype = min_column_type(normalized_column, input_column_dtype) # Scalar case if len(col_to_normalize) == 1: if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): @@ -767,8 +730,7 @@ def _normalize_find_and_replace_input( raise TypeError(f"Type {type(col_to_normalize)} not supported") if ( - col_to_normalize_dtype.kind == "f" - and input_column_dtype.kind in {"i", "u"} + col_to_normalize_dtype.kind == "f" and input_column_dtype.kind in {"i", "u"} ) or (col_to_normalize_dtype.num > input_column_dtype.num): raise TypeError( f"Potentially unsafe cast for non-equivalent " @@ -778,9 +740,7 @@ def _normalize_find_and_replace_input( return normalized_column.astype(input_column_dtype) -def digitize( - column: ColumnBase, bins: np.ndarray, right: bool = False -) -> ColumnBase: +def digitize(column: ColumnBase, bins: np.ndarray, right: bool = False) -> ColumnBase: """Return the indices of the bins to which each value in column belongs. Parameters diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index c45a9c7fd5d..ecb3a14e18b 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
"""Define an interface for columns that can perform numerical operations.""" from __future__ import annotations @@ -99,9 +99,7 @@ def quantile( return_scalar: bool, ) -> NumericalBaseColumn: if np.logical_or(q < 0, q > 1).any(): - raise ValueError( - "percentiles should all be in the interval [0, 1]" - ) + raise ValueError("percentiles should all be in the interval [0, 1]") # Beyond this point, q either being scalar or list-like # will only have values in range [0, 1] if len(self) == 0: @@ -119,9 +117,7 @@ def quantile( try: new_scalar = self.dtype.type(scalar_result) scalar_result = ( - new_scalar - if new_scalar == scalar_result - else scalar_result + new_scalar if new_scalar == scalar_result else scalar_result ) except (TypeError, ValueError): pass @@ -138,9 +134,7 @@ def mean( min_count: int = 0, dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count, dtype=dtype) def var( self, @@ -182,20 +176,14 @@ def _numeric_quantile( self, q: np.ndarray, interpolation: str, exact: bool ) -> NumericalBaseColumn: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( - [self], [True], "first", stable=True - ).slice(self.null_count, len(self)) - - return libcudf.quantiles.quantile( - self, q, interpolation, indices, exact + indices = libcudf.sort.order_by([self], [True], "first", stable=True).slice( + self.null_count, len(self) ) + return libcudf.quantiles.quantile(self, q, interpolation, indices, exact) + def cov(self, other: NumericalBaseColumn) -> float: - if ( - len(self) == 0 - or len(other) == 0 - or (len(self) == 1 and len(other) == 1) - ): + if len(self) == 0 or len(other) == 0 or (len(self) == 1 and len(other) == 1): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) result = (self - self.mean()) * (other - other.mean()) @@ -213,9 +201,7 @@ def corr(self, other: NumericalBaseColumn) -> float: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) return cov / lhs_std / rhs_std - def round( - self, decimals: int = 0, how: str = "half_even" - ) -> NumericalBaseColumn: + def round(self, decimals: int = 0, how: str = "half_even") -> NumericalBaseColumn: if not cudf.api.types.is_integer(decimals): raise TypeError("Values in decimals must be integers") """Round the values in the Column to the given number of decimals.""" diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fb76fcdaf39..49f69f63592 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -40,9 +40,9 @@ def str_to_boolean(column: StringColumn): """Takes in string column and returns boolean column""" - return ( - libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") - ).fillna(False) + return (libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8")).fillna( + False + ) if TYPE_CHECKING: @@ -126,9 +126,7 @@ def __init__(self, parent): else parent.dtype ) if not is_string_dtype(value_type): - raise AttributeError( - "Can only use .str accessor with string values" - ) + raise AttributeError("Can only use .str accessor with string values") super().__init__(parent=parent) def htoi(self) -> SeriesOrIndex: @@ -219,9 +217,7 @@ def len(self) -> SeriesOrIndex: dtype: int32 """ - return self._return_or_inplace( - libstrings.count_characters(self._column) - ) + return self._return_or_inplace(libstrings.count_characters(self._column)) def byte_count(self) -> SeriesOrIndex: """ @@ -255,9 +251,7 @@ 
def byte_count(self) -> SeriesOrIndex: ) @overload - def cat( - self, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> str: + def cat(self, sep: Optional[str] = None, na_rep: Optional[str] = None) -> str: ... @overload @@ -380,9 +374,7 @@ def cat(self, others=None, sep=None, na_rep=None): out = out[0] return out - def join( - self, sep=None, string_na_rep=None, sep_na_rep=None - ) -> SeriesOrIndex: + def join(self, sep=None, string_na_rep=None, sep_na_rep=None) -> SeriesOrIndex: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -502,9 +494,7 @@ def join( string_na_rep = "" if is_scalar(sep) and sep_na_rep: - raise ValueError( - "sep_na_rep cannot be defined when `sep` is scalar." - ) + raise ValueError("sep_na_rep cannot be defined when `sep` is scalar.") if sep_na_rep is None: sep_na_rep = "" @@ -567,9 +557,7 @@ def _split_by_character(self): children=(offset_col, result_col), ) - def extract( - self, pat: str, flags: int = 0, expand: bool = True - ) -> SeriesOrIndex: + def extract(self, pat: str, flags: int = 0, expand: bool = True) -> SeriesOrIndex: r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -628,9 +616,7 @@ def extract( re.MULTILINE. """ # noqa W605 if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) + raise NotImplementedError("unsupported value for `flags` parameter") data, _ = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: @@ -762,13 +748,9 @@ def contains( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) + raise NotImplementedError("unsupported value for `flags` parameter") if regex and not case: - raise NotImplementedError( - "`case=False` only supported when `regex=False`" - ) + raise NotImplementedError("`case=False` only supported when `regex=False`") if is_scalar(pat): if regex: @@ -838,17 +820,13 @@ def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: dtype: boolean """ if not isinstance(pat, str): - raise TypeError( - f"expected a string object, not {type(pat).__name__}" - ) + raise TypeError(f"expected a string object, not {type(pat).__name__}") if esc is None: esc = "" if not isinstance(esc, str): - raise TypeError( - f"expected a string object, not {type(esc).__name__}" - ) + raise TypeError(f"expected a string object, not {type(esc).__name__}") if len(esc) > 1: raise ValueError( @@ -914,9 +892,7 @@ def repeat( ), ) - return self._return_or_inplace( - libstrings.repeat_scalar(self._column, repeats) - ) + return self._return_or_inplace(libstrings.repeat_scalar(self._column, repeats)) def replace( self, @@ -1025,9 +1001,7 @@ def replace( # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( - libstrings.replace_re( - self._column, pat, cudf.Scalar(repl, "str"), n - ) + libstrings.replace_re(self._column, pat, cudf.Scalar(repl, "str"), n) if regex is True and len(pat) > 1 else libstrings.replace( self._column, @@ -1271,9 +1245,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace( - str_cast.istimestamp(self._column, format) - ) + return self._return_or_inplace(str_cast.istimestamp(self._column, format)) def isfloat(self) -> SeriesOrIndex: r""" @@ -2084,14 +2056,10 @@ def filter_alphanum( repl = "" return self._return_or_inplace( - libstrings.filter_alphanum( - 
self._column, cudf.Scalar(repl, "str"), keep - ), + libstrings.filter_alphanum(self._column, cudf.Scalar(repl, "str"), keep), ) - def slice_from( - self, starts: "cudf.Series", stops: "cudf.Series" - ) -> SeriesOrIndex: + def slice_from(self, starts: "cudf.Series", stops: "cudf.Series") -> SeriesOrIndex: """ Return substring of each string using positions for each string. @@ -2228,9 +2196,7 @@ def slice_replace( ), ) - def insert( - self, start: int = 0, repl: Optional[str] = None - ) -> SeriesOrIndex: + def insert(self, start: int = 0, repl: Optional[str] = None) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2397,9 +2363,7 @@ def get_json_object( options = libstrings.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, - strip_quotes_from_single_strings=( - strip_quotes_from_single_strings - ), + strip_quotes_from_single_strings=(strip_quotes_from_single_strings), missing_fields_as_nulls=missing_fields_as_nulls, ) return self._return_or_inplace( @@ -2535,8 +2499,7 @@ def split( if expand not in (True, False): raise ValueError( - f"expand parameter accepts only : [True, False], " - f"got {expand}" + f"expand parameter accepts only : [True, False], " f"got {expand}" ) # Pandas treats 0 as all @@ -2559,9 +2522,7 @@ def split( if regex is True: data, _ = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) + data, _ = libstrings.split(self._column, cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2711,8 +2672,7 @@ def rsplit( if expand not in (True, False): raise ValueError( - f"expand parameter accepts only : [True, False], " - f"got {expand}" + f"expand parameter accepts only : [True, False], " f"got {expand}" ) # Pandas treats 0 as all @@ -2741,9 +2701,7 @@ def rsplit( result_table = data else: if regex is True: - result_table = libstrings.rsplit_record_re( - self._column, pat, n - ) + result_table = libstrings.rsplit_record_re(self._column, pat, n) else: result_table = libstrings.rsplit_record( self._column, cudf.Scalar(pat, "str"), n @@ -2823,9 +2781,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: """ if expand is not True: - raise NotImplementedError( - "`expand=False` is currently not supported" - ) + raise NotImplementedError("`expand=False` is currently not supported") if sep is None: sep = " " @@ -2888,9 +2844,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) """ if expand is not True: - raise NotImplementedError( - "`expand=False` is currently not supported" - ) + raise NotImplementedError("`expand=False` is currently not supported") if sep is None: sep = " " @@ -2900,9 +2854,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: expand=expand, ) - def pad( - self, width: int, side: str = "left", fillchar: str = " " - ) -> SeriesOrIndex: + def pad(self, width: int, side: str = "left", fillchar: str = " ") -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. 
@@ -2964,9 +2916,7 @@ def pad( dtype: object """ if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) + msg = f"fillchar must be a character, not {type(fillchar).__name__}" raise TypeError(msg) if len(fillchar) != 1: @@ -2979,9 +2929,7 @@ def pad( try: side = libstrings.SideType[side.upper()] except KeyError: - raise ValueError( - "side has to be either one of {'left', 'right', 'both'}" - ) + raise ValueError("side has to be either one of {'left', 'right', 'both'}") return self._return_or_inplace( libstrings.pad(self._column, width, fillchar, side) @@ -3109,9 +3057,7 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: dtype: object """ if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) + msg = f"fillchar must be a character, not {type(fillchar).__name__}" raise TypeError(msg) if len(fillchar) != 1: @@ -3121,9 +3067,7 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace( - libstrings.center(self._column, width, fillchar) - ) + return self._return_or_inplace(libstrings.center(self._column, width, fillchar)) def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3163,9 +3107,7 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: dtype: object """ if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) + msg = f"fillchar must be a character, not {type(fillchar).__name__}" raise TypeError(msg) if len(fillchar) != 1: @@ -3175,9 +3117,7 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace( - libstrings.ljust(self._column, width, fillchar) - ) + return self._return_or_inplace(libstrings.ljust(self._column, width, fillchar)) def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3217,9 +3157,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: dtype: object """ if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) + msg = f"fillchar must be a character, not {type(fillchar).__name__}" raise TypeError(msg) if len(fillchar) != 1: @@ -3229,9 +3167,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace( - libstrings.rjust(self._column, width, fillchar) - ) + return self._return_or_inplace(libstrings.rjust(self._column, width, fillchar)) def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: r""" @@ -3448,42 +3384,30 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: if expand_tabs is True: raise NotImplementedError("`expand_tabs=True` is not supported") elif expand_tabs is None: - warnings.warn( - "wrap current implementation defaults to `expand_tabs`=False" - ) + warnings.warn("wrap current implementation defaults to `expand_tabs`=False") replace_whitespace = kwargs.get("replace_whitespace", True) if not replace_whitespace: - raise NotImplementedError( - "`replace_whitespace=False` is not supported" - ) + raise NotImplementedError("`replace_whitespace=False` is not supported") drop_whitespace = kwargs.get("drop_whitespace", True) if not 
drop_whitespace: - raise NotImplementedError( - "`drop_whitespace=False` is not supported" - ) + raise NotImplementedError("`drop_whitespace=False` is not supported") break_long_words = kwargs.get("break_long_words", None) if break_long_words is True: - raise NotImplementedError( - "`break_long_words=True` is not supported" - ) + raise NotImplementedError("`break_long_words=True` is not supported") elif break_long_words is None: warnings.warn( - "wrap current implementation defaults to " - "`break_long_words`=False" + "wrap current implementation defaults to " "`break_long_words`=False" ) break_on_hyphens = kwargs.get("break_on_hyphens", None) if break_long_words is True: - raise NotImplementedError( - "`break_on_hyphens=True` is not supported" - ) + raise NotImplementedError("`break_on_hyphens=True` is not supported") elif break_on_hyphens is None: warnings.warn( - "wrap current implementation defaults to " - "`break_on_hyphens`=False" + "wrap current implementation defaults to " "`break_on_hyphens`=False" ) return self._return_or_inplace(libstrings.wrap(self._column, width)) @@ -3551,13 +3475,9 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) + raise NotImplementedError("unsupported value for `flags` parameter") - return self._return_or_inplace( - libstrings.count_re(self._column, pat, flags) - ) + return self._return_or_inplace(libstrings.count_re(self._column, pat, flags)) def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3628,9 +3548,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) + raise NotImplementedError("unsupported value for `flags` parameter") data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) @@ -3834,9 +3752,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: f"{type(pat).__name__}" ) elif is_scalar(pat): - result_col = libstrings.endswith( - self._column, cudf.Scalar(pat, "str") - ) + result_col = libstrings.endswith(self._column, cudf.Scalar(pat, "str")) else: result_col = libstrings.endswith_multiple( self._column, column.as_column(pat, dtype="str") @@ -3897,9 +3813,7 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: f"{type(pat).__name__}" ) elif is_scalar(pat): - result_col = libstrings.startswith( - self._column, cudf.Scalar(pat, "str") - ) + result_col = libstrings.startswith(self._column, cudf.Scalar(pat, "str")) else: result_col = libstrings.startswith_multiple( self._column, column.as_column(pat, dtype="str") @@ -3940,12 +3854,8 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: """ if suffix is None or len(suffix) == 0: return self._return_or_inplace(self._column) - ends_column = libstrings.endswith( - self._column, cudf.Scalar(suffix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, 0, -len(suffix), None - ) + ends_column = libstrings.endswith(self._column, cudf.Scalar(suffix, "str")) + removed_column = libstrings.slice_strings(self._column, 0, -len(suffix), None) result = cudf._lib.copying.copy_if_else( removed_column, self._column, ends_column ) @@ -3984,12 +3894,8 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: """ if prefix is None or len(prefix) == 0: return self._return_or_inplace(self._column) 
- starts_column = libstrings.startswith( - self._column, cudf.Scalar(prefix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, len(prefix), None, None - ) + starts_column = libstrings.startswith(self._column, cudf.Scalar(prefix, "str")) + removed_column = libstrings.slice_strings(self._column, len(prefix), None, None) result = cudf._lib.copying.copy_if_else( removed_column, self._column, starts_column ) @@ -4039,16 +3945,12 @@ def find( dtype: int32 """ if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) + raise TypeError(f"expected a string object, not {type(sub).__name__}") if end is None: end = -1 - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = libstrings.find(self._column, cudf.Scalar(sub, "str"), start, end) return self._return_or_inplace(result_col) @@ -4100,16 +4002,12 @@ def rfind( dtype: int32 """ if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) + raise TypeError(f"expected a string object, not {type(sub).__name__}") if end is None: end = -1 - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = libstrings.rfind(self._column, cudf.Scalar(sub, "str"), start, end) return self._return_or_inplace(result_col) @@ -4157,16 +4055,12 @@ def index( dtype: int32 """ if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) + raise TypeError(f"expected a string object, not {type(sub).__name__}") if end is None: end = -1 - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = libstrings.find(self._column, cudf.Scalar(sub, "str"), start, end) result = self._return_or_inplace(result_col) @@ -4219,16 +4113,12 @@ def rindex( dtype: int32 """ if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) + raise TypeError(f"expected a string object, not {type(sub).__name__}") if end is None: end = -1 - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = libstrings.rfind(self._column, cudf.Scalar(sub, "str"), start, end) result = self._return_or_inplace(result_col) @@ -4237,9 +4127,7 @@ def rindex( else: return result - def match( - self, pat: str, case: bool = True, flags: int = 0 - ) -> SeriesOrIndex: + def match(self, pat: str, case: bool = True, flags: int = 0) -> SeriesOrIndex: """ Determine if each string matches a regular expression. 
@@ -4288,13 +4176,9 @@ def match( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) + raise NotImplementedError("unsupported value for `flags` parameter") - return self._return_or_inplace( - libstrings.match_re(self._column, pat, flags) - ) + return self._return_or_inplace(libstrings.match_re(self._column, pat, flags)) def url_decode(self) -> SeriesOrIndex: """ @@ -4438,9 +4322,7 @@ def translate(self, table: dict) -> SeriesOrIndex: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace( - libstrings.translate(self._column, table) - ) + return self._return_or_inplace(libstrings.translate(self._column, table)) def filter_characters( self, table: dict, keep: bool = True, repl: Optional[str] = None @@ -4513,9 +4395,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(libstrings.normalize_spaces(self._column)) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4618,9 +4498,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: ) return result - def detokenize( - self, indices: "cudf.Series", separator: str = " " - ) -> SeriesOrIndex: + def detokenize(self, indices: "cudf.Series", separator: str = " ") -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -4790,9 +4668,7 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: retain_index=False, ) - def character_ngrams( - self, n: int = 2, as_list: bool = False - ) -> SeriesOrIndex: + def character_ngrams(self, n: int = 2, as_list: bool = False) -> SeriesOrIndex: """ Generate the n-grams from characters in a column of strings. @@ -4855,9 +4731,7 @@ def character_ngrams( return result.explode() return result - def hash_character_ngrams( - self, n: int = 5, as_list: bool = False - ) -> SeriesOrIndex: + def hash_character_ngrams(self, n: int = 5, as_list: bool = False) -> SeriesOrIndex: """ Generate hashes of n-grams from characters in a column of strings. The MurmurHash32 algorithm is used to produce the hash results. 
@@ -5006,8 +4880,7 @@ def replace_tokens( delimiter = "" elif not is_scalar(delimiter): raise TypeError( - f"Type of delimiter should be a string," - f" found {type(delimiter)}" + f"Type of delimiter should be a string," f" found {type(delimiter)}" ) return self._return_or_inplace( @@ -5069,16 +4942,14 @@ def filter_tokens( replacement = "" elif not is_scalar(replacement): raise TypeError( - f"Type of replacement should be a string," - f" found {type(replacement)}" + f"Type of replacement should be a string," f" found {type(replacement)}" ) if delimiter is None: delimiter = "" elif not is_scalar(delimiter): raise TypeError( - f"Type of delimiter should be a string," - f" found {type(delimiter)}" + f"Type of delimiter should be a string," f" found {type(delimiter)}" ) return self._return_or_inplace( @@ -5109,9 +4980,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(libstrings.porter_stemmer_measure(self._column)) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5279,17 +5148,13 @@ def edit_distance_matrix(self) -> SeriesOrIndex: dtype: list """ if self._column.size < 2: - raise ValueError( - "Require size >= 2 to compute edit distance matrix." - ) + raise ValueError("Require size >= 2 to compute edit distance matrix.") if self._column.has_nulls(): raise ValueError( "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." ) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(libstrings.edit_distance_matrix(self._column)) def minhash( self, seeds: Optional[ColumnLike] = None, width: int = 4 @@ -5499,9 +5364,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: - offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype - ) + offsets = column.as_column(0, length=size + 1, dtype=size_type_dtype) children = (offsets,) @@ -5546,9 +5409,7 @@ def end_offset(self) -> int: and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( - self.base_children[0].element_indexing( - self.offset + self.size - ) + self.base_children[0].element_indexing(self.offset + self.size) ) else: self._end_offset = 0 @@ -5561,9 +5422,7 @@ def memory_usage(self) -> int: if self.data is not None: n += self.data.size if len(self.base_children) == 1: - child0_size = (self.size + 1) * self.base_children[ - 0 - ].dtype.itemsize + child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize n += child0_size if self.nullable: @@ -5590,9 +5449,7 @@ def data(self): ): self._data = self.base_data else: - self._data = self.base_data[ - self.start_offset : self.end_offset - ] + self._data = self.base_data[self.start_offset : self.end_offset] return self._data def all(self, skipna: bool = True) -> bool: @@ -5610,9 +5467,7 @@ def any(self, skipna: bool = True) -> bool: raise NotImplementedError("`any` not implemented for `StringColumn`") - def data_array_view( - self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + def data_array_view(self, *, mode="write") -> cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") def to_arrow(self) -> pa.Array: @@ -5632,9 +5487,7 @@ def to_arrow(self) -> pa.Array: ] """ if self.null_count == len(self): - return pa.NullArray.from_buffers( - pa.null(), len(self), [pa.py_buffer(b"")] - ) + return 
pa.NullArray.from_buffers(pa.null(), len(self), [pa.py_buffer(b"")]) else: return super().to_arrow() @@ -5644,9 +5497,7 @@ def sum( dtype: Optional[Dtype] = None, min_count: int = 0, ): - result_col = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) + result_col = self._process_for_reduction(skipna=skipna, min_count=min_count) if isinstance(result_col, type(self)): return libstrings.join( result_col, @@ -5666,9 +5517,7 @@ def __contains__(self, item: ScalarLike) -> bool: self, column.as_column(item, dtype=self.dtype) ) - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: @@ -5699,9 +5548,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if dtype.kind == "M": without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) all_same_length = ( - libstrings.count_characters(without_nat).distinct_count( - dropna=True - ) + libstrings.count_characters(without_nat).distinct_count(dropna=True) == 1 ) if not all_same_length: @@ -5741,9 +5588,7 @@ def as_datetime_column( if self.null_count == len(self): return cast( "cudf.core.column.DatetimeColumn", - column.column_empty( - len(self), dtype=out_dtype, masked=True - ), + column.column_empty(len(self), dtype=out_dtype, masked=True), ) else: format = datetime.infer_format( @@ -5764,14 +5609,10 @@ def as_timedelta_column( format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_decimal_column( - self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": + def as_decimal_column(self, dtype: Dtype) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> StringColumn: + def as_string_column(self, dtype: Dtype, format: str | None = None) -> StringColumn: return self @property @@ -5796,9 +5637,7 @@ def to_pandas( arrow_type: bool = False, ) -> pd.Series: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") if arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index @@ -5814,10 +5653,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif ( - to_dtype.kind in {"i", "u"} - and not libstrings.is_integer(self).all() - ): + elif to_dtype.kind in {"i", "u"} and not libstrings.is_integer(self).all(): return False elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): return False @@ -5844,14 +5680,9 @@ def find_and_replace( f"value dtype: {replacement_col.dtype}" ) - if ( - to_replace_col.dtype != self.dtype - and replacement_col.dtype != self.dtype - ): + if to_replace_col.dtype != self.dtype and replacement_col.dtype != self.dtype: return self.copy() - df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} - ) + df = cudf.DataFrame._from_data({"old": to_replace_col, "new": replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: res = self.fillna( @@ -5879,9 +5710,7 @@ def fillna( fill_value = cudf.Scalar(fill_value, dtype=self.dtype) return super().fillna(fill_value, method=method) - def normalize_binop_value( - self, other - ) -> Union[column.ColumnBase, cudf.Scalar]: + def normalize_binop_value(self, other) -> Union[column.ColumnBase, cudf.Scalar]: if ( isinstance(other, (column.ColumnBase, cudf.Scalar)) and other.dtype == "object" @@ -5891,9 +5720,7 @@ def normalize_binop_value( return cudf.Scalar(other) return NotImplemented - def _binaryop( - self, other: ColumnBinaryOperand, op: str - ) -> "column.ColumnBase": + def _binaryop(self, other: ColumnBinaryOperand, op: str) -> "column.ColumnBase": reflect, op = self._check_reflected_op(op) # Due to https://github.com/pandas-dev/pandas/issues/46332 we need to # support binary operations between empty or all null string columns @@ -5938,9 +5765,7 @@ def _binaryop( if isinstance(other, cudf.Scalar): other = cast( StringColumn, - column.as_column( - other, length=len(self), dtype="object" - ), + column.as_column(other, length=len(self), dtype="object"), ) # Explicit types are necessary because mypy infers ColumnBase @@ -5967,17 +5792,13 @@ def _binaryop( "NULL_EQUALS", }: lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop( - lhs=lhs, rhs=rhs, op=op, dtype="bool" - ) + return libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype="bool") return NotImplemented @copy_docstring(column.ColumnBase.view) def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: - raise ValueError( - "Can not produce a view of a string column with nulls" - ) + raise ValueError("Can not produce a view of a string column with nulls") dtype = cudf.api.types.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( @@ -6006,9 +5827,7 @@ def _get_cols_list(parent_obj, others): and len(others) > 0 and ( can_convert_to_column( - others.iloc[0] - if isinstance(others, cudf.Series) - else others[0] + others.iloc[0] if isinstance(others, cudf.Series) else others[0] ) ) ): diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 1b2ffcc2700..b3026d2478d 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -35,17 +35,12 @@ def base_size(self): def to_arrow(self): children = [ - 
pa.nulls(len(child)) - if len(child) == child.null_count - else child.to_arrow() + pa.nulls(len(child)) if len(child) == child.null_count else child.to_arrow() for child in self.children ] pa_type = pa.struct( - { - field: child.type - for field, child in zip(self.dtype.fields, children) - } + {field: child.type for field, child in zip(self.dtype.fields, children)} ) if self.nullable: @@ -67,16 +62,12 @@ def to_pandas( # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + return pd.Series(pd.arrays.ArrowExtensionArray(pa_array), index=index) else: return pd.Series(pa_array.tolist(), dtype="object", index=index) @@ -94,8 +85,7 @@ def memory_usage(self): def element_indexing(self, index: int): result = super().element_indexing(index) return { - field: value - for field, value in zip(self.dtype.fields, result.values()) + field: value for field, value in zip(self.dtype.fields, result.values()) } def __setitem__(self, key, value): @@ -172,9 +162,7 @@ class StructMethods(ColumnMethods): def __init__(self, parent=None): if not isinstance(parent.dtype, StructDtype): - raise AttributeError( - "Can only use .struct accessor with a 'struct' dtype" - ) + raise AttributeError("Can only use .struct accessor with a 'struct' dtype") super().__init__(parent=parent) def field(self, key): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 0d24e8e5120..cc45b0e1956 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -122,17 +122,13 @@ def values(self): """ Return a CuPy representation of the TimeDeltaColumn. 
""" - raise NotImplementedError( - "TimeDelta Arrays is not yet implemented in cudf" - ) + raise NotImplementedError("TimeDelta Arrays is not yet implemented in cudf") @acquire_spill_lock() def to_arrow(self) -> pa.Array: mask = None if self.nullable: - mask = pa.py_buffer( - self.mask_array_view(mode="read").copy_to_host() - ) + mask = pa.py_buffer(self.mask_array_view(mode="read").copy_to_host()) data = pa.py_buffer( self.as_numerical_column("int64") .data_array_view(mode="read") @@ -176,9 +172,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this = self.astype(common_dtype).astype(out_dtype) if isinstance(other, cudf.Scalar): if other.is_valid(): - other = other.value.astype(common_dtype).astype( - out_dtype - ) + other = other.value.astype(common_dtype).astype(out_dtype) else: other = cudf.Scalar(None, out_dtype) else: @@ -205,9 +199,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs, rhs = (other, this) if reflect else (this, other) result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if cudf.get_option( - "mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): + if cudf.get_option("mode.pandas_compatible") and out_dtype == cudf.dtype( + np.bool_ + ): result = result.fillna(op == "__ne__") return result @@ -269,9 +263,7 @@ def fillna( fill_value = column.as_column(fill_value, nan_as_null=False) return super().fillna(fill_value, method) - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, dtype=np.int64, @@ -284,9 +276,7 @@ def as_numerical_column( def as_datetime_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": - raise TypeError( - f"cannot astype a timedelta from {self.dtype} to {dtype}" - ) + raise TypeError(f"cannot astype a timedelta from {self.dtype} to {dtype}") def as_string_column( self, dtype: Dtype, format: str | None = None @@ -296,9 +286,9 @@ def as_string_column( self.dtype.name, "%D days %H:%M:%S" ) if len(self) > 0: - return string._timedelta_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format=format) + return string._timedelta_to_str_typecast_functions[cudf.dtype(self.dtype)]( + self, format=format + ) else: return cast( "cudf.core.column.StringColumn", @@ -342,9 +332,7 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timedelta(result, unit=self.time_unit).as_unit( - self.time_unit - ) + return pd.Timedelta(result, unit=self.time_unit).as_unit(self.time_unit) return result.astype(self.dtype) def sum( @@ -382,9 +370,7 @@ def cov(self, other: TimeDeltaColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return self.as_numerical_column("int64").cov(other.as_numerical_column("int64")) def corr(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): @@ -433,35 +419,35 @@ def components(self, index=None) -> "cudf.DataFrame": data = { "days": self // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) + np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( + self.dtype + ) ), "hours": ( self % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) + 
np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( + self.dtype + ) ) ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) + np.timedelta64(_unit_to_nanoseconds_conversion["h"], "ns").astype( + self.dtype + ) ), "minutes": ( self % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) + np.timedelta64(_unit_to_nanoseconds_conversion["h"], "ns").astype( + self.dtype + ) ) ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["m"], "ns" - ).astype(self.dtype) + np.timedelta64(_unit_to_nanoseconds_conversion["m"], "ns").astype( + self.dtype + ) ), } keys_list = iter(date_meta.keys()) @@ -475,9 +461,9 @@ def components(self, index=None) -> "cudf.DataFrame": ).astype(self.dtype) ) ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[1]], "ns" - ).astype(self.dtype) + np.timedelta64(_unit_to_nanoseconds_conversion[value[1]], "ns").astype( + self.dtype + ) ) if self._time_unit == value[1]: break @@ -525,13 +511,11 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": return ( self % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) + np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( + self.dtype + ) ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ) + ) // cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns")) @property def microseconds(self) -> "cudf.core.column.NumericalColumn": @@ -549,12 +533,10 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": return ( self - % np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ).astype(self.dtype) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) + % np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns").astype( + self.dtype + ) + ) // cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns")) @property def nanoseconds(self) -> "cudf.core.column.NumericalColumn": @@ -578,12 +560,8 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": return cast("cudf.core.column.NumericalColumn", res_col) return ( self - % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ) + % cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns")) + ) // cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns")) def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 33085bede78..faed3abc618 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -169,9 +169,7 @@ def __repr__(self) -> str: f"multiindex={self.multiindex}, " f"level_names={self.level_names})" ) - column_info = "\n".join( - [f"{name}: {col.dtype}" for name, col in self.items()] - ) + column_info = "\n".join([f"{name}: {col.dtype}" for name, col in self.items()]) return f"{type_info}\n{column_info}" @property @@ -243,9 +241,7 @@ def to_pandas_index(self) -> pd.Index: # Determine if we can return a RangeIndex if self.rangeindex: if not self.names: - return pd.RangeIndex( - start=0, stop=0, step=1, name=self.name - ) + return pd.RangeIndex(start=0, stop=0, step=1, name=self.name) elif cudf.api.types.infer_dtype(self.names) == "integer": if len(self.names) == 1: 
start = self.names[0] @@ -255,9 +251,7 @@ def to_pandas_index(self) -> pd.Index: uniques = np.unique(np.diff(np.array(self.names))) if len(uniques) == 1 and uniques[0] != 0: diff = uniques[0] - new_range = range( - self.names[0], self.names[-1] + diff, diff - ) + new_range = range(self.names[0], self.names[-1] + diff, diff) return pd.RangeIndex(new_range, name=self.name) result = pd.Index( self.names, @@ -267,9 +261,7 @@ def to_pandas_index(self) -> pd.Index: ) return result - def insert( - self, name: Any, value: Any, loc: int = -1, validate: bool = True - ): + def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): """ Insert column into the ColumnAccessor at the specified location. @@ -292,9 +284,7 @@ def insert( if loc == -1: loc = ncols if not (0 <= loc <= ncols): - raise ValueError( - "insert: loc out of bounds: must be 0 <= loc <= ncols" - ) + raise ValueError("insert: loc out of bounds: must be 0 <= loc <= ncols") # TODO: we should move all insert logic here if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") @@ -370,9 +360,7 @@ def get_labels_by_index(self, index: Any) -> tuple: return (self.names[index],) elif (bn := len(index)) > 0 and all(map(is_bool, index)): if bn != (n := len(self.names)): - raise IndexError( - f"Boolean mask has wrong length: {bn} not {n}" - ) + raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") if isinstance(index, (pd.Series, cudf.Series)): # Don't allow iloc indexing with series raise NotImplementedError( @@ -474,13 +462,9 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: # Special-casing for boolean mask if (bn := len(key)) > 0 and all(map(is_bool, key)): if bn != (n := len(self.names)): - raise IndexError( - f"Boolean mask has wrong length: {bn} not {n}" - ) + raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") data = dict( - item - for item, keep in zip(self._grouped_data.items(), key) - if keep + item for item, keep in zip(self._grouped_data.items(), key) if keep ) else: data = {k: self._grouped_data[k] for k in key} @@ -641,12 +625,9 @@ def droplevel(self, level): level += self.nlevels self._data = { - _remove_key_level(key, level): value - for key, value in self._data.items() + _remove_key_level(key, level): value for key, value in self._data.items() } - self._level_names = ( - self._level_names[:level] + self._level_names[level + 1 :] - ) + self._level_names = self._level_names[:level] + self._level_names[level + 1 :] if ( len(self._level_names) == 1 @@ -704,9 +685,7 @@ def _get_level(x, nlevels, level_names): if x < 0: x += nlevels if x >= nlevels: - raise IndexError( - f"Level {x} out of bounds. Index has {nlevels} levels." - ) + raise IndexError(f"Level {x} out of bounds. Index has {nlevels} levels.") return x else: x = level_names.index(x) diff --git a/python/cudf/cudf/core/common.py b/python/cudf/cudf/core/common.py index 5276cd518e5..8cf3d35b11f 100644 --- a/python/cudf/cudf/core/common.py +++ b/python/cudf/cudf/core/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
def pipe(obj, func, *args, **kwargs): @@ -28,9 +28,7 @@ def pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - raise ValueError( - f"{target} is both the pipe target and a keyword argument" - ) + raise ValueError(f"{target} is both the pipe target and a keyword argument") kwargs[target] = obj return func(*args, **kwargs) else: diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 6afbc0bbc65..4d10db7fb58 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast @@ -61,18 +61,14 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # TODO: we should fix this further up. # Alternately we can have an Optional[Column] and handle None # specially in _gather. - self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) - ) + self.column = cast("NumericalColumn", self.column.astype(size_type_dtype)) else: if self.column.dtype.kind not in {"i", "u"}: raise TypeError("Gather map must have integer dtype") if not nullify: lo, hi = libcudf.reduce.minmax(self.column) if lo.value < -nrows or hi.value >= nrows: - raise IndexError( - f"Gather map is out of bounds for [0, {nrows})" - ) + raise IndexError(f"Gather map is out of bounds for [0, {nrows})") @classmethod def from_column_unchecked( diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index ccf730c91fb..ae7cad33734 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from collections import abc @@ -159,9 +159,7 @@ def cut( # create bins if given an int or single scalar if not isinstance(bins, pd.IntervalIndex): if not isinstance(bins, (abc.Sequence)): - if isinstance( - x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray) - ): + if isinstance(x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)): mn = x.min() mx = x.max() else: @@ -208,14 +206,10 @@ def cut( old_bins[0], old_bins[1], periods=1, closed=closed ) else: - interval_labels = IntervalIndex.from_breaks( - old_bins, closed=closed - ) + interval_labels = IntervalIndex.from_breaks(old_bins, closed=closed) else: # get labels for categories - interval_labels = IntervalIndex.from_breaks( - int_label_bins, closed=closed - ) + interval_labels = IntervalIndex.from_breaks(int_label_bins, closed=closed) elif labels is not False: if not (is_list_like(labels)): raise ValueError( @@ -238,9 +232,7 @@ def cut( labels, categories=None, ordered=False ) else: - interval_labels = ( - labels if len(set(labels)) == len(labels) else None - ) + interval_labels = labels if len(set(labels)) == len(labels) else None if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0440512c467..191fa7cf125 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -135,10 +135,7 @@ def _shape_mismatch_error(x, y): class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - if ( - isinstance(self._frame.index, MultiIndex) - or self._frame._data.multiindex - ): + if isinstance(self._frame.index, MultiIndex) or self._frame._data.multiindex: # This try/except block allows the use of pandas-like # tuple arguments into MultiIndex dataframes. 
try: @@ -171,9 +168,7 @@ def _can_downcast_to_series(self, df, arg): if not is_scalar(arg[1]): return False elif (is_list_like(arg[0]) or is_column_like(arg[0])) and ( - is_list_like(arg[1]) - or is_column_like(arg[0]) - or type(arg[1]) is slice + is_list_like(arg[1]) or is_column_like(arg[0]) or type(arg[1]) is slice ): return False else: @@ -183,9 +178,7 @@ def _can_downcast_to_series(self, df, arg): return True dtypes = df.dtypes.values.tolist() all_numeric = all(is_numeric_dtype(t) for t in dtypes) - if all_numeric or ( - len(dtypes) and all(t == dtypes[0] for t in dtypes) - ): + if all_numeric or (len(dtypes) and all(t == dtypes[0] for t in dtypes)): return True if isinstance(arg[1], tuple): return True @@ -318,9 +311,7 @@ def _getitem_tuple_arg(self, arg): as_column( tmp_arg[0], dtype=self._frame.index.dtype - if isinstance( - self._frame.index.dtype, cudf.CategoricalDtype - ) + if isinstance(self._frame.index.dtype, cudf.CategoricalDtype) else None, ), tmp_arg[1], @@ -332,27 +323,17 @@ def _getitem_tuple_arg(self, arg): ) else: tmp_col_name = str(uuid4()) - cantor_name = "_" + "_".join( - map(str, columns_df._data.names) - ) + cantor_name = "_" + "_".join(map(str, columns_df._data.names)) if columns_df._data.multiindex: # column names must be appropriate length tuples - extra = tuple( - "" for _ in range(columns_df._data.nlevels - 1) - ) + extra = tuple("" for _ in range(columns_df._data.nlevels - 1)) tmp_col_name = (tmp_col_name, *extra) cantor_name = (cantor_name, *extra) other_df = DataFrame( - { - tmp_col_name: column.as_column( - range(len(tmp_arg[0])) - ) - }, + {tmp_col_name: column.as_column(range(len(tmp_arg[0])))}, index=as_index(tmp_arg[0]), ) - columns_df[cantor_name] = column.as_column( - range(len(columns_df)) - ) + columns_df[cantor_name] = column.as_column(range(len(columns_df))) df = other_df.join(columns_df, how="inner") # as join is not assigning any names to index, # update it over here @@ -370,10 +351,7 @@ def _getitem_tuple_arg(self, arg): @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): - if ( - isinstance(self._frame.index, MultiIndex) - or self._frame._data.multiindex - ): + if isinstance(self._frame.index, MultiIndex) or self._frame._data.multiindex: raise NotImplementedError( "Setting values using df.loc[] not supported on " "DataFrames with a MultiIndex" @@ -401,9 +379,7 @@ def _setitem_tuple_arg(self, key, value): new_col = cudf.Series(value, index=idx) if not self._frame.empty: - new_col = new_col._align_to_index( - self._frame.index, how="right" - ) + new_col = new_col._align_to_index(self._frame.index, how="right") if self._frame.empty: self._frame.index = ( @@ -470,13 +446,14 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): _frame: DataFrame def __getitem__(self, arg): - row_key, ( - col_is_scalar, - column_names, + ( + row_key, + ( + col_is_scalar, + column_names, + ), ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame) - row_spec = indexing_utils.parse_row_iloc_indexer( - row_key, len(self._frame) - ) + row_spec = indexing_utils.parse_row_iloc_indexer(row_key, len(self._frame)) ca = self._frame._data index = self._frame.index if col_is_scalar: @@ -708,9 +685,7 @@ def __init__( self._reindex( column_names=columns, index=index, deep=False, inplace=True ) - if isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ): + if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)): self._data.rangeindex = True else: self._data = data._data @@ -754,9 +729,7 @@ def __init__( label_dtype = getattr(columns, 
"dtype", None) self._data = ColumnAccessor( { - k: column.column_empty( - len(self), dtype="object", masked=True - ) + k: column.column_empty(len(self), dtype="object", masked=True) for k in columns }, level_names=tuple(columns.names) @@ -776,13 +749,9 @@ def __init__( # descr is an optional field of the _cuda_ary_iface_ if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: - new_df = self._from_arrays( - data, index=index, columns=columns - ) + new_df = self._from_arrays(data, index=index, columns=columns) else: - new_df = self.from_records( - data, index=index, columns=columns - ) + new_df = self.from_records(data, index=index, columns=columns) else: new_df = self._from_arrays(data, index=index, columns=columns) @@ -825,18 +794,12 @@ def __init__( ) self._data.rangeindex = rangeindex self._data.label_dtype = ( - cudf.dtype(label_dtype) - if label_dtype is not None - else None + cudf.dtype(label_dtype) if label_dtype is not None else None ) elif len(data) > 0 and isinstance(data[0], Series): - self._init_from_series_list( - data=data, columns=columns, index=index - ) + self._init_from_series_list(data=data, columns=columns, index=index) else: - self._init_from_list_like( - data, index=index, columns=columns - ) + self._init_from_list_like(data, index=index, columns=columns) self._check_data_index_length_match() else: if not is_dict_like(data): @@ -918,8 +881,7 @@ def _init_from_series_list(self, data, columns, index): for idx, series in enumerate(data): if not series.index.is_unique: raise ValueError( - "Reindexing only valid with uniquely valued Index " - "objects" + "Reindexing only valid with uniquely valued Index " "objects" ) if not series.index.equals(final_columns): series = series.reindex(final_columns) @@ -983,9 +945,7 @@ def _init_from_list_like(self, data, index=None, columns=None): elif len(data) > 0 and isinstance(data[0], pd.Interval): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data - elif any( - not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data - ): + elif any(not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data): raise TypeError("Inputs should be an iterable or sequence.") elif len(data) > 0 and not can_convert_to_column(data[0]): raise ValueError("Must pass 2-d input.") @@ -1026,9 +986,7 @@ def _init_from_list_like(self, data, index=None, columns=None): self._data.label_dtype = getattr(columns, "dtype", None) @_cudf_nvtx_annotate - def _init_from_dict_like( - self, data, index=None, columns=None, nan_as_null=None - ): + def _init_from_dict_like(self, data, index=None, columns=None, nan_as_null=None): label_dtype = None if columns is not None: label_dtype = getattr(columns, "dtype", None) @@ -1051,9 +1009,7 @@ def _init_from_dict_like( masked=index is not None, ) - data = { - c: data[c] if c in data else empty_column() for c in columns - } + data = {c: data[c] if c in data else empty_column() for c in columns} data, index = self._align_input_series_indices(data, index=index) @@ -1132,9 +1088,7 @@ def _align_input_series_indices(data, index): ] else: - aligned_input_series = cudf.core.series._align_indices( - input_series - ) + aligned_input_series = cudf.core.series._align_indices(input_series) index = aligned_input_series[0].index for name, val in data.items(): @@ -1172,9 +1126,7 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] - obj = super().deserialize( - header, frames[header["index_frame_count"] :] - ) + obj = 
super().deserialize(header, frames[header["index_frame_count"] :]) idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) @@ -1228,9 +1180,7 @@ def __dir__(self): o = set(dir(type(self))) o.update(self.__dict__) o.update( - c - for c in self._column_names - if isinstance(c, str) and c.isidentifier() + c for c in self._column_names if isinstance(c, str) and c.isidentifier() ) return list(o) @@ -1347,9 +1297,7 @@ def __getitem__(self, arg): elif isinstance(arg, DataFrame): return self.where(arg) else: - raise TypeError( - f"__getitem__ on type {type(arg)} is not supported" - ) + raise TypeError(f"__getitem__ on type {type(arg)} is not supported") @_cudf_nvtx_annotate def __setitem__(self, arg, value): @@ -1367,9 +1315,9 @@ def __setitem__(self, arg, value): if is_scalar(value): self._data[col_name][scatter_map] = value else: - self._data[col_name][scatter_map] = column.as_column( - value - )[scatter_map] + self._data[col_name][scatter_map] = column.as_column(value)[ + scatter_map + ] elif is_scalar(arg) or isinstance(arg, tuple): if isinstance(value, DataFrame): _setitem_with_dataframe( @@ -1455,16 +1403,12 @@ def __setitem__(self, arg, value): else: for col in arg: if is_scalar(value): - self._data[col] = as_column( - value, length=len(self) - ) + self._data[col] = as_column(value, length=len(self)) else: self._data[col] = column.as_column(value) else: - raise TypeError( - f"__setitem__ on type {type(arg)} is not supported" - ) + raise TypeError(f"__setitem__ on type {type(arg)} is not supported") def __delitem__(self, name): self._drop_column(name) @@ -1559,9 +1503,7 @@ def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): @classmethod @_cudf_nvtx_annotate - def _concat( - cls, objs, axis=0, join="outer", ignore_index=False, sort=False - ): + def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): # flag to indicate at least one empty input frame also has an index empty_has_index = False # length of output frame's RangeIndex if all input frames are empty, @@ -1602,9 +1544,7 @@ def _concat( intersecting_columns = functools.reduce( set.intersection, sets_of_column_names ) - union_of_columns = functools.reduce( - set.union, sets_of_column_names - ) + union_of_columns = functools.reduce(set.union, sets_of_column_names) non_intersecting_columns = union_of_columns.symmetric_difference( intersecting_columns ) @@ -1622,8 +1562,7 @@ def _concat( if axis == 0: if ignore_index and ( - num_empty_input_frames > 0 - or len(intersecting_columns) == 0 + num_empty_input_frames > 0 or len(intersecting_columns) == 0 ): # When ignore_index is True and if there is # at least 1 empty dataframe and no @@ -1663,9 +1602,7 @@ def _concat( # can't sort anything else. 
try: str_names = sorted(n for n in names if isinstance(n, str)) - non_str_names = sorted( - n for n in names if not isinstance(n, str) - ) + non_str_names = sorted(n for n in names if not isinstance(n, str)) names = non_str_names + str_names except TypeError: names = list(names) @@ -1681,8 +1618,7 @@ def _concat( columns = [ ( [] - if are_all_range_index - or (ignore_index and not empty_has_index) + if are_all_range_index or (ignore_index and not empty_has_index) else list(f._index._data.columns) ) + [f._data[name] if name in f._data else None for name in names] @@ -1748,9 +1684,7 @@ def _concat( if empty_has_index and num_empty_input_frames == len(objs): out._index = cudf.RangeIndex(result_index_length) elif are_all_range_index and not ignore_index: - out._index = cudf.core.index.Index._concat( - [o._index for o in objs] - ) + out._index = cudf.core.index.Index._concat([o._index for o in objs]) # Reassign the categories for any categorical table cols _reassign_categories( @@ -1767,13 +1701,9 @@ def _concat( if not isinstance(out._index, MultiIndex) and isinstance( out._index._values.dtype, cudf.CategoricalDtype ): - out = out.set_index( - cudf.core.index.as_index(out.index._values) - ) + out = out.set_index(cudf.core.index.as_index(out.index._values)) for name, col in out._data.items(): - out._data[name] = col._with_type_metadata( - tables[0]._data[name].dtype - ) + out._data[name] = col._with_type_metadata(tables[0]._data[name].dtype) # Reassign index and column names if objs[0]._data.multiindex: @@ -1832,9 +1762,7 @@ def _clean_renderable_dataframe(self, output): if lines[-1].startswith("["): lines = lines[:-1] - lines.append( - "[%d rows x %d columns]" % (len(self), len(self._data.names)) - ) + lines.append("[%d rows x %d columns]" % (len(self), len(self._data.names))) return "\n".join(lines) def _clean_nulls_from_dataframe(self, df): @@ -1847,9 +1775,7 @@ def _clean_nulls_from_dataframe(self, df): filling with `` values. """ for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): + if isinstance(df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype)): # TODO we need to handle this pass elif df._data[col].has_nulls(): @@ -1920,9 +1846,7 @@ def _get_renderable_dataframe(self): right_cols = len(self._data.names) - int(ncols / 2.0) # adjust right columns for output if multiindex. right_cols = ( - right_cols - 1 - if isinstance(self.index, MultiIndex) - else right_cols + right_cols - 1 if isinstance(self.index, MultiIndex) else right_cols ) left_cols = int(ncols / 2.0) + 1 if right_cols > 0: @@ -1961,17 +1885,11 @@ def __repr__(self): @_cudf_nvtx_annotate def _repr_html_(self): - lines = ( - self._get_renderable_dataframe() - .to_pandas() - ._repr_html_() - .split("\n") - ) + lines = self._get_renderable_dataframe().to_pandas()._repr_html_().split("\n") if lines[-2].startswith("
<p>"): lines = lines[:-2] lines.append( - "<p>%d rows × %d columns</p>" - % (len(self), len(self._data.names)) + "<p>%d rows × %d columns</p>
" % (len(self), len(self._data.names)) ) lines.append("") return "\n".join(lines) @@ -1981,9 +1899,7 @@ def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() @_cudf_nvtx_annotate - def _get_columns_by_label( - self, labels, *, downcast=False - ) -> Self | Series: + def _get_columns_by_label(self, labels, *, downcast=False) -> Self | Series: """ Return columns of dataframe by `labels` @@ -2034,11 +1950,7 @@ def _make_operands_and_index_for_binop( if ( not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS - and ( - not self._data.to_pandas_index().equals( - other.index.to_pandas() - ) - ) + and (not self._data.to_pandas_index().equals(other.index.to_pandas())) ): raise ValueError( "Can only compare DataFrame & Series objects " @@ -2049,9 +1961,7 @@ def _make_operands_and_index_for_binop( # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) - equal_columns = other.index.to_pandas().equals( - self._data.to_pandas_index() - ) + equal_columns = other.index.to_pandas().equals(self._data.to_pandas_index()) can_use_self_column_name = ( equal_columns or list(other._index._data.names) == self._data._level_names @@ -2079,8 +1989,7 @@ def _make_operands_and_index_for_binop( left_default = fill_value equal_columns = self._column_names == other._column_names can_use_self_column_name = ( - equal_columns - or self._data._level_names == other._data._level_names + equal_columns or self._data._level_names == other._data._level_names ) elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately @@ -2216,9 +2125,7 @@ def from_dict( ): result = cls(data).T result.columns = ( - columns - if columns is not None - else range(len(result._data)) + columns if columns is not None else range(len(result._data)) ) if dtype is not None: result = result.astype(dtype) @@ -2234,22 +2141,14 @@ def from_dict( ) elif orient == "columns": if columns is not None: - raise ValueError( - "Cannot use columns parameter with orient='columns'" - ) + raise ValueError("Cannot use columns parameter with orient='columns'") return cls(data, columns=None, dtype=dtype) elif orient == "tight": if columns is not None: - raise ValueError( - "Cannot use columns parameter with orient='right'" - ) + raise ValueError("Cannot use columns parameter with orient='right'") - index = _from_dict_create_index( - data["index"], data["index_names"], cudf - ) - columns = _from_dict_create_index( - data["columns"], data["column_names"], pd - ) + index = _from_dict_create_index(data["index"], data["index_names"], cudf) + columns = _from_dict_create_index(data["columns"], data["column_names"], pd) return cls(data["data"], index=index, columns=columns, dtype=dtype) else: raise ValueError( @@ -2369,9 +2268,7 @@ def to_dict( elif issubclass(into, abc.Mapping): cons = into # type: ignore[assignment] if issubclass(into, defaultdict): - raise TypeError( - "to_dict() only accepts initialized defaultdicts" - ) + raise TypeError("to_dict() only accepts initialized defaultdicts") else: raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] @@ -2465,8 +2362,7 @@ def scatter_by_map( if map_size: result += [ - self._empty_like(keep_index) - for _ in range(map_size - len(result)) + self._empty_like(keep_index) for _ in range(map_size - len(result)) ] return result @@ -2528,9 +2424,7 @@ def update( if join != "left": raise 
NotImplementedError("Only left join is supported") if errors not in {"ignore", "raise"}: - raise ValueError( - "The parameter errors must be either 'ignore' or 'raise'" - ) + raise ValueError("The parameter errors must be either 'ignore' or 'raise'") if filter_func is not None: raise NotImplementedError("filter_func is not supported yet") @@ -2587,9 +2481,7 @@ def equals(self, other): ret = super().equals(other) # If all other checks matched, validate names. if ret: - for self_name, other_name in zip( - self._data.names, other._data.names - ): + for self_name, other_name in zip(self._data.names, other._data.names): if self_name != other_name: ret = False break @@ -2963,9 +2855,7 @@ def set_index( raise TypeError(msg) if isinstance(col, (MultiIndex, pd.MultiIndex)): col = ( - cudf.from_pandas(col) - if isinstance(col, pd.MultiIndex) - else col + cudf.from_pandas(col) if isinstance(col, pd.MultiIndex) else col ) cols = [col._data[x] for x in col._data] columns_to_add.extend(cols) @@ -2977,9 +2867,7 @@ def set_index( else: # For pandas obj, convert to gpu obj columns_to_add.append(as_column(col)) - if isinstance( - col, (cudf.Series, cudf.Index, pd.Series, pd.Index) - ): + if isinstance(col, (cudf.Series, cudf.Index, pd.Series, pd.Index)): names.append(col.name) else: names.append(None) @@ -3037,9 +2925,7 @@ def where(self, cond, other=None, inplace=False): {name: cond._column for name in self._column_names}, ) elif hasattr(cond, "__cuda_array_interface__"): - cond = DataFrame( - cond, columns=self._column_names, index=self.index - ) + cond = DataFrame(cond, columns=self._column_names, index=self.index) elif ( hasattr(cond, "__array_interface__") and cond.__array_interface__["shape"] != self.shape @@ -3053,9 +2939,7 @@ def where(self, cond, other=None, inplace=False): cond = cond.reindex(self.index) else: if cond.shape != self.shape: - raise ValueError( - "Array conditional must be same shape as self" - ) + raise ValueError("Array conditional must be same shape as self") # Setting `self` column names to `cond` as it has no column names. cond._set_columns_like(self._data) @@ -3084,9 +2968,7 @@ def where(self, cond, other=None, inplace=False): ) if cond_col := cond._data.get(name): - result = cudf._lib.copying.copy_if_else( - col, other_col, cond_col - ) + result = cudf._lib.copying.copy_if_else(col, other_col, cond_col) out[name] = _make_categorical_like(result, self._data[name]) else: @@ -3096,9 +2978,7 @@ def where(self, cond, other=None, inplace=False): ) out[name] = col.set_mask(out_mask) - return self._mimic_inplace( - self._from_data_like_self(out), inplace=inplace - ) + return self._mimic_inplace(self._from_data_like_self(out), inplace=inplace) @docutils.doc_apply( doc_reset_index_template.format( @@ -3234,9 +3114,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): # least require a deprecation cycle because we currently support # inserting a pd.Categorical. 
if isinstance(value, pd.Categorical): - value = cudf.core.column.categorical.pandas_categorical_as_column( - value - ) + value = cudf.core.column.categorical.pandas_categorical_as_column(value) if _is_scalar_or_zero_d_array(value): dtype = None @@ -3269,9 +3147,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): elif isinstance(value, (pd.Series, Series)): value = Series(value, nan_as_null=nan_as_null) if not ignore_index: - value = value._align_to_index( - self._index, how="right", sort=False - ) + value = value._align_to_index(self._index, how="right", sort=False) value = column.as_column(value, nan_as_null=nan_as_null) @@ -3560,17 +3436,13 @@ def rename( with a number. """ if errors != "ignore": - raise NotImplementedError( - "Only errors='ignore' is currently supported" - ) + raise NotImplementedError("Only errors='ignore' is currently supported") if mapper is None and index is None and columns is None: return self.copy(deep=copy) index = mapper if index is None and axis in (0, "index") else index - columns = ( - mapper if columns is None and axis in (1, "columns") else columns - ) + columns = mapper if columns is None and axis in (1, "columns") else columns if index: if ( @@ -3624,17 +3496,13 @@ def rename( @_cudf_nvtx_annotate def add_prefix(self, prefix): out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in list(self._data.keys())] return out @_cudf_nvtx_annotate def add_suffix(self, suffix): out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in list(self._data.keys())] return out @_cudf_nvtx_annotate @@ -3683,9 +3551,7 @@ def agg(self, aggs, axis=None): if axis == 0 or axis is not None: raise NotImplementedError("axis not implemented yet") - if isinstance(aggs, abc.Iterable) and not isinstance( - aggs, (str, dict) - ): + if isinstance(aggs, abc.Iterable) and not isinstance(aggs, (str, dict)): result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization @@ -3696,8 +3562,7 @@ def agg(self, aggs, axis=None): elif isinstance(aggs, str): if not hasattr(self, aggs): raise AttributeError( - f"{aggs} is not a valid function for " - f"'DataFrame' object" + f"{aggs} is not a valid function for " f"'DataFrame' object" ) result = DataFrame() result[aggs] = getattr(self, aggs)() @@ -3708,17 +3573,14 @@ def agg(self, aggs, axis=None): elif isinstance(aggs, dict): cols = aggs.keys() if any(callable(val) for val in aggs.values()): - raise NotImplementedError( - "callable parameter is not implemented yet" - ) + raise NotImplementedError("callable parameter is not implemented yet") elif all(isinstance(val, str) for val in aggs.values()): res = {} for key, value in aggs.items(): col = self[key] if not hasattr(col, value): raise AttributeError( - f"{value} is not a valid function for " - f"'Series' object" + f"{value} is not a valid function for " f"'Series' object" ) res[key] = getattr(col, value)() result = cudf.Series(list(res.values()), index=res.keys()) @@ -3738,9 +3600,7 @@ def agg(self, aggs, axis=None): result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = self[key] - col_empty = column_empty( - len(idxs), dtype=col.dtype, masked=True - ) + col_empty = column_empty(len(idxs), dtype=col.dtype, masked=True) ans = cudf.Series(data=col_empty, index=idxs) if isinstance(aggs.get(key), abc.Iterable): # 
TODO : Allow simultaneous pass for multi-aggregation @@ -3766,9 +3626,7 @@ def agg(self, aggs, axis=None): return result elif callable(aggs): - raise NotImplementedError( - "callable parameter is not implemented yet" - ) + raise NotImplementedError("callable parameter is not implemented yet") else: raise ValueError("argument must be a string, list or dict") @@ -4003,8 +3861,7 @@ def transpose(self): source_dtype = source_columns[0].dtype if isinstance(source_dtype, cudf.CategoricalDtype): if any( - not isinstance(c.dtype, cudf.CategoricalDtype) - for c in source_columns + not isinstance(c.dtype, cudf.CategoricalDtype) for c in source_columns ): raise ValueError("Columns must all have the same dtype") cats = list(c.categories for c in source_columns) @@ -4185,17 +4042,14 @@ def merge( ordering. """ if indicator: - raise NotImplementedError( - "Only indicator=False is currently supported" - ) + raise NotImplementedError("Only indicator=False is currently supported") if lsuffix or rsuffix: raise ValueError( "The lsuffix and rsuffix keywords have been replaced with the " "``suffixes=`` keyword. " "Please provide the following instead: \n\n" - " suffixes=('%s', '%s')" - % (lsuffix or "_x", rsuffix or "_y") + " suffixes=('%s', '%s')" % (lsuffix or "_x", rsuffix or "_y") ) else: lsuffix, rsuffix = suffixes @@ -4270,9 +4124,7 @@ def join( suffixes=(lsuffix, rsuffix), sort=sort, ) - df.index.name = ( - None if self.index.name != other.index.name else self.index.name - ) + df.index.name = None if self.index.name != other.index.name else self.index.name return df @_cudf_nvtx_annotate @@ -4392,8 +4244,7 @@ def query(self, expr, local_dict=None): if not isinstance(local_dict, dict): raise TypeError( - f"local_dict type: expected dict but found " - f"{type(local_dict)}" + f"local_dict type: expected dict but found " f"{type(local_dict)}" ) # Get calling environment @@ -4405,14 +4256,10 @@ def query(self, expr, local_dict=None): } # Run query boolmask = queryutils.query_execute(self, expr, callenv) - return self._apply_boolean_mask( - BooleanMask.from_column_unchecked(boolmask) - ) + return self._apply_boolean_mask(BooleanMask.from_column_unchecked(boolmask)) @_cudf_nvtx_annotate - def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs - ): + def apply(self, func, axis=1, raw=False, result_type=None, args=(), **kwargs): """ Apply a function along an axis of the DataFrame. ``apply`` relies on Numba to JIT compile ``func``. @@ -4589,9 +4436,7 @@ def apply( """ if axis != 1: - raise ValueError( - "DataFrame.apply currently only supports row wise ops" - ) + raise ValueError("DataFrame.apply currently only supports row wise ops") if raw: raise ValueError("The `raw` kwarg is not yet supported.") if result_type is not None: @@ -4626,8 +4471,7 @@ def applymap( # Do not remove until pandas 3.0 support is added. assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( - "DataFrame.applymap has been deprecated. Use DataFrame.map " - "instead.", + "DataFrame.applymap has been deprecated. 
Use DataFrame.map " "instead.", FutureWarning, ) return self.map(func=func, na_action=na_action, **kwargs) @@ -4865,9 +4709,7 @@ def partition_by_hash(self, columns, nparts, keep_index=True): else: cols = [*self._columns] - output_columns, offsets = libcudf.hash.hash_partition( - cols, key_indices, nparts - ) + output_columns, offsets = libcudf.hash.hash_partition(cols, key_indices, nparts) outdf = self._from_columns_like_self( output_columns, self._column_names, @@ -5031,9 +4873,7 @@ def info( entries_summary = f", {self._index[0]} to {self._index[-1]}" else: entries_summary = "" - index_summary = ( - f"{index_name}: {len(self._index)} entries{entries_summary}" - ) + index_summary = f"{index_name}: {len(self._index)} entries{entries_summary}" lines.append(index_summary) if len(self._data) == 0: @@ -5075,22 +4915,17 @@ def _verbose_repr(): space_num = max(max_id, len_id) + col_space counts = None - header = _put_str(id_head, space_num) + _put_str( - column_head, space - ) + header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.count().to_pandas().tolist() if col_count != len(counts): raise AssertionError( - f"Columns must equal " - f"counts ({col_count} != {len(counts)})" + f"Columns must equal " f"counts ({col_count} != {len(counts)})" ) count_header = "Non-Null Count" len_count = len(count_header) non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len( - non_null - ) + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) space_count = max(len_count, max_count) + col_space count_temp = "{count}" + non_null else: @@ -5178,9 +5013,7 @@ def _sizeof_fmt(num, size_qualifier): if "object" in dtype_counts or self.index.dtype == "object": size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append( - f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n" - ) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") cudf.utils.ioutils.buffer_write_lines(buf, lines) @@ -5206,9 +5039,7 @@ def describe( data_to_describe = self else: - data_to_describe = self.select_dtypes( - include=include, exclude=exclude - ) + data_to_describe = self.select_dtypes(include=include, exclude=exclude) if data_to_describe.empty: raise ValueError("No data of included types.") @@ -5222,15 +5053,9 @@ def describe( if len(describe_series_list) == 1: return describe_series_list[0].to_frame() else: - ldesc_indexes = sorted( - (x.index for x in describe_series_list), key=len - ) + ldesc_indexes = sorted((x.index for x in describe_series_list), key=len) names = dict.fromkeys( - [ - name - for idxnames in ldesc_indexes - for name in idxnames.to_pandas() - ], + [name for idxnames in ldesc_indexes for name in idxnames.to_pandas()], None, ) @@ -5326,9 +5151,7 @@ def to_pandas( """ out_index = self.index.to_pandas() out_data = { - i: col.to_pandas( - index=out_index, nullable=nullable, arrow_type=arrow_type - ) + i: col.to_pandas(index=out_index, nullable=nullable, arrow_type=arrow_type) for i, col in enumerate(self._data.columns) } @@ -5369,18 +5192,14 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): 2 3 4 """ if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) + nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None if isinstance(dataframe, pd.DataFrame): if not dataframe.columns.is_unique: raise ValueError("Duplicate column names are not allowed") data = { - col_name: column.as_column( - 
col_value.array, nan_as_null=nan_as_null - ) + col_name: column.as_column(col_value.array, nan_as_null=nan_as_null) for col_name, col_value in dataframe.items() } if isinstance(dataframe.index, pd.MultiIndex): @@ -5388,9 +5207,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): dataframe.index, nan_as_null=nan_as_null ) else: - index = cudf.Index.from_pandas( - dataframe.index, nan_as_null=nan_as_null - ) + index = cudf.Index.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) df._data._level_names = tuple(dataframe.columns.names) @@ -5406,9 +5223,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): # this isn't pandas specific return from_dataframe(dataframe, allow_copy=True) else: - raise TypeError( - f"Could not construct DataFrame from {type(dataframe)}" - ) + raise TypeError(f"Could not construct DataFrame from {type(dataframe)}") @classmethod @_cudf_nvtx_annotate @@ -5533,18 +5348,13 @@ def to_arrow(self, preserve_index=True): else: if isinstance(self.index, MultiIndex): gen_names = tuple( - f"level_{i}" - for i, _ in enumerate(self.index._data.names) + f"level_{i}" for i, _ in enumerate(self.index._data.names) ) else: gen_names = ( - self.index.names - if self.index.name is not None - else ("index",) + self.index.names if self.index.name is not None else ("index",) ) - for gen_name, col_name in zip( - gen_names, self.index._data.names - ): + for gen_name, col_name in zip(gen_names, self.index._data.names): data._insert( data.shape[1], gen_name, @@ -5609,9 +5419,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): DataFrame """ if data.ndim != 1 and data.ndim != 2: - raise ValueError( - f"records dimension expected 1 or 2 but found {data.ndim}" - ) + raise ValueError(f"records dimension expected 1 or 2 but found {data.ndim}") num_cols = len(data[0]) @@ -5624,8 +5432,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): else: if len(columns) != num_cols: raise ValueError( - f"columns length expected {num_cols} " - f"but found {len(columns)}" + f"columns length expected {num_cols} " f"but found {len(columns)}" ) names = columns @@ -5633,14 +5440,10 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): if data.ndim == 2: for i, k in enumerate(names): - df._data[k] = column.as_column( - data[:, i], nan_as_null=nan_as_null - ) + df._data[k] = column.as_column(data[:, i], nan_as_null=nan_as_null) elif data.ndim == 1: for k in names: - df._data[k] = column.as_column( - data[k], nan_as_null=nan_as_null - ) + df._data[k] = column.as_column(data[k], nan_as_null=nan_as_null) if index is None: df._index = RangeIndex(start=0, stop=len(data)) @@ -5692,8 +5495,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): else: if len(columns) != num_cols: raise ValueError( - f"columns length expected {num_cols} but " - f"found {len(columns)}" + f"columns length expected {num_cols} but " f"found {len(columns)}" ) elif len(columns) != len(set(columns)): raise ValueError("Duplicate column names are not allowed") @@ -5702,13 +5504,9 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df = cls() if data.ndim == 2: for i, k in enumerate(names): - df._data[k] = column.as_column( - data[:, i], nan_as_null=nan_as_null - ) + df._data[k] = column.as_column(data[:, i], nan_as_null=nan_as_null) elif data.ndim == 1: - df._data[names[0]] = column.as_column( - data, nan_as_null=nan_as_null - ) + df._data[names[0]] = column.as_column(data, 
nan_as_null=nan_as_null) if isinstance(columns, pd.Index): df._data._level_names = tuple(columns.names) if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)): @@ -5861,9 +5659,7 @@ def quantile( if q_is_number: result = result.transpose() - return Series( - data=result._columns[0], index=result.index, name=q - ) + return Series(data=result._columns[0], index=result.index, name=q) else: # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" @@ -5969,10 +5765,7 @@ def make_false_column_like_self(): # precomputed variables inside the loop requires nontrivial logic. # This optimization could be attempted if `isin` ever becomes a # bottleneck. - if ( - isinstance(values, (Series, DataFrame)) - and not values.index.is_unique - ): + if isinstance(values, (Series, DataFrame)) and not values.index.is_unique: # if DataFrame ever supports duplicate columns # would need to check that here raise ValueError("cannot compute isin with a duplicate axis.") @@ -6125,12 +5918,7 @@ def count(self, axis=0, numeric_only=False): raise NotImplementedError("Only axis=0 is currently supported.") length = len(self) return Series._from_data( - { - None: [ - length - self._data[col].null_count - for col in self._data.names - ] - }, + {None: [length - self._data[col].null_count for col in self._data.names]}, as_index(self._data.names), ) @@ -6189,9 +5977,7 @@ def _reduce( # TODO: concat + op can probably be done in the general case # for axis == 2. # https://github.com/rapidsai/cudf/issues/14930 - return getattr(concat_columns(source._data.columns), op)( - **kwargs - ) + return getattr(concat_columns(source._data.columns), op)(**kwargs) try: result = [ getattr(source._data[col], op)(**kwargs) @@ -6235,17 +6021,13 @@ def _reduce( else: raise if axis == 2: - return getattr(as_column(result, nan_as_null=False), op)( - **kwargs - ) + return getattr(as_column(result, nan_as_null=False), op)(**kwargs) else: source_dtypes = [c.dtype for c in source._data.columns] common_dtype = find_common_type(source_dtypes) if ( is_object_dtype(common_dtype) - and any( - not is_object_dtype(dtype) for dtype in source_dtypes - ) + and any(not is_object_dtype(dtype) for dtype in source_dtypes) or not is_bool_dtype(common_dtype) and any(is_bool_dtype(dtype) for dtype in source_dtypes) ): @@ -6368,17 +6150,13 @@ def mode(self, axis=0, numeric_only=False, dropna=True): else: data_df = self - mode_results = [ - data_df[col].mode(dropna=dropna) for col in data_df._data - ] + mode_results = [data_df[col].mode(dropna=dropna) for col in data_df._data] if len(mode_results) == 0: return DataFrame() with warnings.catch_warnings(): - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." 
warnings.simplefilter("ignore", FutureWarning) df = cudf.concat(mode_results, axis=1) @@ -6451,9 +6229,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared._data[col] = ( prepared._data[col] .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) + cudf.utils.dtypes.get_min_float_dtype(prepared._data[col]) if not is_datetime_dtype(common_dtype) else cudf.dtype("float64") ) @@ -6480,8 +6256,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): } result_dtype = ( common_dtype - if method in type_coerced_methods - or is_datetime_dtype(common_dtype) + if method in type_coerced_methods or is_datetime_dtype(common_dtype) else None ) result = column.as_column(result, dtype=result_dtype) @@ -6504,9 +6279,7 @@ def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. """ - return DataFrame( - {col: self._data[col] for col in columns}, index=self.index - ) + return DataFrame({col: self._data[col] for col in columns}, index=self.index) @_cudf_nvtx_annotate def select_dtypes(self, include=None, exclude=None): @@ -6586,9 +6359,7 @@ def select_dtypes(self, include=None, exclude=None): selection = tuple(map(frozenset, (include, exclude))) if not any(selection): - raise ValueError( - "at least one of include or exclude must be nonempty" - ) + raise ValueError("at least one of include or exclude must be nonempty") include, exclude = map( lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)), @@ -6597,9 +6368,7 @@ def select_dtypes(self, include=None, exclude=None): # can't both include AND exclude! if not include.isdisjoint(exclude): - raise ValueError( - f"include and exclude overlap on {(include & exclude)}" - ) + raise ValueError(f"include and exclude overlap on {(include & exclude)}") # include all subtypes include_subtypes = set() @@ -6924,9 +6693,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): "level must be either an int/str, or a list of int/str." ) else: - raise ValueError( - "level must be either an int/str, or a list of int/str." 
- ) + raise ValueError("level must be either an int/str, or a list of int/str.") level = [level] if not isinstance(level, list) else level @@ -6954,9 +6721,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): ) else: # Must be a list of positions, normalize negative positions - level_indices = [ - lv + self._data.nlevels if lv < 0 else lv for lv in level - ] + level_indices = [lv + self._data.nlevels if lv < 0 else lv for lv in level] unnamed_levels_indices = [ i for i in range(self._data.nlevels) if i not in level_indices @@ -7001,18 +6766,14 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` - column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels - ) + column_idx_df = pd.DataFrame(data=range(len(self._data)), index=named_levels) column_indices: list[list[int]] = [] if has_unnamed_levels: unnamed_level_values = list( map(column_name_idx.get_level_values, unnamed_levels_indices) ) - unnamed_level_values = pd.MultiIndex.from_arrays( - unnamed_level_values - ) + unnamed_level_values = pd.MultiIndex.from_arrays(unnamed_level_values) def unnamed_group_generator(): if has_unnamed_levels: @@ -7028,9 +6789,11 @@ def unnamed_group_generator(): unique_named_levels, axis=0, fill_value=-1 ).values else: - yield grpdf.reindex( - unique_named_levels, axis=0, fill_value=-1 - ).sort_index().values + yield ( + grpdf.reindex(unique_named_levels, axis=0, fill_value=-1) + .sort_index() + .values + ) else: if future_stack: yield column_idx_df.values @@ -7056,9 +6819,7 @@ def unnamed_group_generator(): ) all_nulls = functools.cache( - functools.partial( - column_empty, self.shape[0], common_type, masked=True - ) + functools.partial(column_empty, self.shape[0], common_type, masked=True) ) # homogenize the dtypes of the columns @@ -7071,9 +6832,7 @@ def unnamed_group_generator(): # Construct the resulting dataframe / series if not has_unnamed_levels: - result = Series._from_data( - data={None: stacked[0]}, index=new_index - ) + result = Series._from_data(data={None: stacked[0]}, index=new_index) else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) @@ -7083,14 +6842,9 @@ def unnamed_group_generator(): dict( zip( unnamed_level_values, - [ - stacked[i] - for i in unnamed_level_values.argsort().argsort() - ] + [stacked[i] for i in unnamed_level_values.argsort().argsort()] if not future_stack - else [ - stacked[i] for i in unnamed_level_values.argsort() - ], + else [stacked[i] for i in unnamed_level_values.argsort()], ) ), isinstance(unnamed_level_values, pd.MultiIndex), @@ -7292,9 +7046,7 @@ def pivot_table( @_cudf_nvtx_annotate @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): - return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value - ) + return cudf.core.reshape.unstack(self, level=level, fill_value=fill_value) @_cudf_nvtx_annotate def explode(self, column, ignore_index=False): @@ -7391,9 +7143,7 @@ def pct_change( if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." 
warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " @@ -7411,13 +7161,9 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) - return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq - ) + return data.diff(periods=periods) / data.shift(periods=periods, freq=freq) - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ): + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): return df_protocol.__dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) @@ -7465,8 +7211,7 @@ def _sample_axis_1( # Since cuDF does not support multiple columns with same name, # sample with replace=True at axis 1 is unsupported. raise NotImplementedError( - "Sample is not supported for axis 1/`columns` when" - "`replace=True`." + "Sample is not supported for axis 1/`columns` when" "`replace=True`." ) sampled_column_labels = random_state.choice( @@ -7530,9 +7275,7 @@ def interleave_columns(self): The interleaved columns as a single column """ if ("category" == self.dtypes).any(): - raise ValueError( - "interleave_columns does not support 'category' dtype." - ) + raise ValueError("interleave_columns does not support 'category' dtype.") return self._constructor_sliced._from_data( {None: libcudf.reshape.interleave_columns([*self._columns])} @@ -7642,9 +7385,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): the output of earlier statements) is not supported. """ if kwargs: - raise ValueError( - "Keyword arguments other than `inplace` are not supported" - ) + raise ValueError("Keyword arguments other than `inplace` are not supported") # Have to use a regex match to avoid capturing ==, >=, or <= equals_sign_regex = "[^=><]=[^=]" @@ -7662,9 +7403,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): if not includes_assignment: if inplace: - raise ValueError( - "Cannot operate inplace if there is no assignment" - ) + raise ValueError("Cannot operate inplace if there is no assignment") return Series._from_data( { None: libcudf.transform.compute_column( @@ -7688,9 +7427,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): exprs.append(e.strip()) cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) + libcudf.transform.compute_column([*self._columns], self._column_names, e) for e in exprs ) ret = self if inplace else self.copy(deep=False) @@ -7855,9 +7592,7 @@ def func(left, right, output): if isinstance(right, Series): uncommon_columns = set(left._column_names) ^ set(right.index) elif isinstance(right, DataFrame): - uncommon_columns = set(left._column_names) ^ set( - right._column_names - ) + uncommon_columns = set(left._column_names) ^ set(right._column_names) elif _is_scalar_or_zero_d_array(right): for name, col in output._data.items(): output._data[name] = col.fillna(value) @@ -7866,9 +7601,7 @@ def func(left, right, output): return output for name in uncommon_columns: - output._data[name] = as_column( - value, length=len(output), dtype="bool" - ) + output._data[name] = as_column(value, length=len(output), dtype="bool") return output return func @@ -7889,9 +7622,7 @@ def func(left, right, output): "gt", "ge", ]: - setattr( - DataFrame, binop, make_binop_func(binop, _make_replacement_func(False)) - ) + setattr(DataFrame, binop, make_binop_func(binop, _make_replacement_func(False))) @_cudf_nvtx_annotate @@ -7994,9 +7725,7 @@ def from_pandas(obj, 
nan_as_null=no_default): """ if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) + nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None if isinstance(obj, pd.DataFrame): return DataFrame.from_pandas(obj, nan_as_null=nan_as_null) @@ -8096,9 +7825,7 @@ def _setitem_with_dataframe( input_cols = input_df._column_names if len(input_cols) != len(replace_df._column_names): - raise ValueError( - "Number of Input Columns must be same replacement Dataframe" - ) + raise ValueError("Number of Input Columns must be same replacement Dataframe") if ( not ignore_index @@ -8110,9 +7837,7 @@ def _setitem_with_dataframe( for col_1, col_2 in zip(input_cols, replace_df._column_names): if col_1 in input_df._column_names: if mask is not None: - input_df._data[col_1][mask] = column.as_column( - replace_df[col_2] - ) + input_df._data[col_1][mask] = column.as_column(replace_df[col_2]) else: input_df._data[col_1] = column.as_column(replace_df[col_2]) else: @@ -8207,9 +7932,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): if all(is_numeric_dtype(col.dtype) for col in cols): dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories - elif all( - isinstance(col, cudf.core.column.CategoricalColumn) for col in cols - ): + elif all(isinstance(col, cudf.core.column.CategoricalColumn) for col in cols): # Combine and de-dupe the categories categories[idx] = cudf.Series( concat_columns([col.categories for col in cols]) @@ -8254,9 +7977,7 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=build_column( - cols[name].base_data, dtype=cols[name].dtype - ), + codes=build_column(cols[name].base_data, dtype=cols[name].dtype), mask=cols[name].base_mask, offset=cols[name].offset, size=cols[name].size, diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 62ded8ac6f1..e0ea964c767 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -247,9 +247,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: kind = _DtypeKind.CATEGORICAL # Codes and categories' dtypes are different. # We use codes' dtype as these are stored in the buffer. - codes = cast( - cudf.core.column.CategoricalColumn, self._col - ).codes + codes = cast(cudf.core.column.CategoricalColumn, self._col).codes dtype = codes.dtype else: raise ValueError( @@ -326,9 +324,7 @@ def describe_null(self) -> Tuple[int, Any]: return _MaskKind.BITMASK, 0 else: - raise NotImplementedError( - f"Data type {self.dtype} not yet supported" - ) + raise NotImplementedError(f"Data type {self.dtype} not yet supported") @property def null_count(self) -> int: @@ -350,9 +346,7 @@ def num_chunks(self) -> int: """ return 1 - def get_chunks( - self, n_chunks: Optional[int] = None - ) -> Iterable["_CuDFColumn"]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["_CuDFColumn"]: """ Return an iterable yielding the chunks. 
@@ -411,21 +405,16 @@ def _get_validity_buffer( if null == _MaskKind.BITMASK: assert self._col.mask is not None - buffer = _CuDFBuffer( - self._col.mask, cp.uint8, allow_copy=self._allow_copy - ) + buffer = _CuDFBuffer(self._col.mask, cp.uint8, allow_copy=self._allow_copy) dtype = (_DtypeKind.UINT, 8, "C", "=") return buffer, dtype elif null == _MaskKind.NAN: raise RuntimeError( - "This column uses NaN as null " - "so does not have a separate mask" + "This column uses NaN as null " "so does not have a separate mask" ) elif null == _MaskKind.NON_NULLABLE: - raise RuntimeError( - "This column is non-nullable so does not have a mask" - ) + raise RuntimeError("This column is non-nullable so does not have a mask") else: raise NotImplementedError( f"See {self.__class__.__name__}.describe_null method." @@ -476,26 +465,18 @@ def _get_data_buffer( dtype = self.dtype elif self.dtype[0] == _DtypeKind.CATEGORICAL: - col_data = cast( - cudf.core.column.CategoricalColumn, self._col - ).codes + col_data = cast(cudf.core.column.CategoricalColumn, self._col).codes dtype = self._dtype_from_cudfdtype(col_data.dtype) elif self.dtype[0] == _DtypeKind.STRING: - col_data = build_column( - data=self._col.data, dtype=np.dtype("int8") - ) + col_data = build_column(data=self._col.data, dtype=np.dtype("int8")) dtype = self._dtype_from_cudfdtype(col_data.dtype) else: - raise NotImplementedError( - f"Data type {self._col.dtype} not handled yet" - ) + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") assert (col_data is not None) and (col_data.data is not None), " " f"col_data(.data) should not be None when dtype = {dtype}" - buffer = _CuDFBuffer( - col_data.data, col_data.dtype, allow_copy=self._allow_copy - ) + buffer = _CuDFBuffer(col_data.data, col_data.dtype, allow_copy=self._allow_copy) return buffer, dtype @@ -535,9 +516,7 @@ def __dataframe__( """ See the docstring of the `cudf.DataFrame.__dataframe__` for details """ - return _CuDFDataFrame( - self._df, nan_as_null=nan_as_null, allow_copy=allow_copy - ) + return _CuDFDataFrame(self._df, nan_as_null=nan_as_null, allow_copy=allow_copy) @property def metadata(self): @@ -558,14 +537,10 @@ def column_names(self) -> Iterable[str]: return self._df._column_names def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn( - as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy - ) + return _CuDFColumn(as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn( - as_column(self._df[name]), allow_copy=self._allow_copy - ) + return _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy) def get_columns(self) -> Iterable[_CuDFColumn]: return [ @@ -587,9 +562,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": self._df.loc[:, names], self._nan_as_null, self._allow_copy ) - def get_chunks( - self, n_chunks: Optional[int] = None - ) -> Iterable["_CuDFDataFrame"]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["_CuDFDataFrame"]: """ Return an iterator yielding the chunks. """ @@ -654,9 +627,7 @@ def __dataframe__( } -def from_dataframe( - df: DataFrameObject, allow_copy: bool = False -) -> _CuDFDataFrame: +def from_dataframe(df: DataFrameObject, allow_copy: bool = False) -> _CuDFDataFrame: """ Construct a ``DataFrame`` from ``df`` if it supports the dataframe interchange protocol (``__dataframe__``). 
@@ -717,24 +688,16 @@ def from_dataframe( _DtypeKind.FLOAT, _DtypeKind.BOOL, ): - columns[name], _buf = _protocol_to_cudf_column_numeric( - col, allow_copy - ) + columns[name], _buf = _protocol_to_cudf_column_numeric(col, allow_copy) elif col.dtype[0] == _DtypeKind.CATEGORICAL: - columns[name], _buf = _protocol_to_cudf_column_categorical( - col, allow_copy - ) + columns[name], _buf = _protocol_to_cudf_column_categorical(col, allow_copy) elif col.dtype[0] == _DtypeKind.STRING: - columns[name], _buf = _protocol_to_cudf_column_string( - col, allow_copy - ) + columns[name], _buf = _protocol_to_cudf_column_string(col, allow_copy) else: - raise NotImplementedError( - f"Data type {col.dtype[0]} not handled yet" - ) + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") _buffers.append(_buf) @@ -796,16 +759,12 @@ def _set_missing_values( if valid_mask is not None: null, invalid = protocol_col.describe_null if null == _MaskKind.BYTEMASK: - valid_mask = _ensure_gpu_buffer( - valid_mask[0], valid_mask[1], allow_copy - ) + valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) boolmask = as_column(valid_mask._buf, dtype="bool") bitmask = cudf._lib.transform.bools_to_mask(boolmask) return cudf_col.set_mask(bitmask) elif null == _MaskKind.BITMASK: - valid_mask = _ensure_gpu_buffer( - valid_mask[0], valid_mask[1], allow_copy - ) + valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) bitmask = valid_mask._buf return cudf_col.set_mask(bitmask) return cudf_col @@ -831,9 +790,7 @@ def _protocol_to_cudf_column_categorical( """ ordered, is_dict, categories = col.describe_categorical if not is_dict: - raise NotImplementedError( - "Non-dictionary categoricals not supported yet" - ) + raise NotImplementedError("Non-dictionary categoricals not supported yet") buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] @@ -894,8 +851,6 @@ def _protocol_to_cudf_column_string( def _protocol_buffer_to_cudf_buffer(protocol_buffer): return as_buffer( - rmm.DeviceBuffer( - ptr=protocol_buffer.ptr, size=protocol_buffer.bufsize - ), + rmm.DeviceBuffer(ptr=protocol_buffer.ptr, size=protocol_buffer.bufsize), exposed=True, ) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 3bd342e24c2..4974329396d 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -225,9 +225,7 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> cudf_dtype CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) """ # noqa: E501 - return CategoricalDtype( - categories=dtype.categories, ordered=dtype.ordered - ) + return CategoricalDtype(categories=dtype.categories, ordered=dtype.ordered) def to_pandas(self) -> pd.CategoricalDtype: """ @@ -309,9 +307,7 @@ def deserialize(cls, header, frames): categories_header = header["categories"] categories_frames = frames categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( - categories_header, categories_frames - ) + categories = categories_type.deserialize(categories_header, categories_frames) return klass(categories=categories, ordered=ordered) def __repr__(self): @@ -358,9 +354,7 @@ def __init__(self, element_type: Any) -> None: if isinstance(element_type, ListDtype): self._typ = pa.list_(element_type._typ) else: - element_type = cudf.utils.dtypes.cudf_dtype_to_pa_type( - element_type - ) + element_type = 
cudf.utils.dtypes.cudf_dtype_to_pa_type(element_type) self._typ = pa.list_(element_type) @cached_property @@ -547,8 +541,7 @@ class StructDtype(_BaseDtype): def __init__(self, fields): pa_fields = { - k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) - for k, v in fields.items() + k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) for k, v in fields.items() } self._typ = pa.struct(pa_fields) @@ -653,9 +646,7 @@ def deserialize(cls, header: dict, frames: list): for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( + fields[k] = pickle.loads(dtype_header["type-serialized"]).deserialize( dtype_header, frames[start:stop], ) @@ -919,9 +910,7 @@ def from_arrow(cls, typ): return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) def to_arrow(self): - return ArrowIntervalType( - pa.from_numpy_dtype(self.subtype), self.closed - ) + return ArrowIntervalType(pa.from_numpy_dtype(self.subtype), self.closed) @classmethod def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": @@ -1059,10 +1048,7 @@ def is_list_dtype(obj): or type(obj) is cudf.core.column.ListColumn or obj is cudf.core.column.ListColumn or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) - or ( - hasattr(obj, "dtype") - and isinstance(obj.dtype, cudf.core.dtypes.ListDtype) - ) + or (hasattr(obj, "dtype") and isinstance(obj.dtype, cudf.core.dtypes.ListDtype)) ) @@ -1109,9 +1095,7 @@ def is_decimal_dtype(obj): Whether or not the array-like or dtype is of the decimal dtype. """ return ( - is_decimal32_dtype(obj) - or is_decimal64_dtype(obj) - or is_decimal128_dtype(obj) + is_decimal32_dtype(obj) or is_decimal64_dtype(obj) or is_decimal128_dtype(obj) ) @@ -1126,9 +1110,7 @@ def _is_interval_dtype(obj): ) or obj is cudf.core.dtypes.IntervalDtype or (isinstance(obj, cudf.core.index.BaseIndex) and obj._is_interval()) - or ( - isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name - ) + or (isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name) or ( isinstance( getattr(obj, "dtype", None), @@ -1163,10 +1145,7 @@ def is_decimal32_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal32Dtype or obj is cudf.core.dtypes.Decimal32Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal32Dtype.name - ) + or (isinstance(obj, str) and obj == cudf.core.dtypes.Decimal32Dtype.name) or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) ) @@ -1175,10 +1154,7 @@ def is_decimal64_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal64Dtype or obj is cudf.core.dtypes.Decimal64Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal64Dtype.name - ) + or (isinstance(obj, str) and obj == cudf.core.dtypes.Decimal64Dtype.name) or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) ) @@ -1187,9 +1163,6 @@ def is_decimal128_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal128Dtype or obj is cudf.core.dtypes.Decimal128Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal128Dtype.name - ) + or (isinstance(obj, str) and obj == cudf.core.dtypes.Decimal128Dtype.name) or (hasattr(obj, "dtype") and is_decimal128_dtype(obj.dtype)) ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 809bdb4e6d1..2294bc40e15 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -87,9 +87,7 @@ def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
@property def _dtypes(self): - return dict( - zip(self._data.names, (col.dtype for col in self._data.columns)) - ) + return dict(zip(self._data.names, (col.dtype for col in self._data.columns))) @_cudf_nvtx_annotate def serialize(self): @@ -121,9 +119,7 @@ def deserialize(cls, header, frames): key = f"column_{metadata}" if key in header: kwargs[metadata] = pickle.loads(header[key]) - col_accessor = ColumnAccessor( - data=dict(zip(column_names, columns)), **kwargs - ) + col_accessor = ColumnAccessor(data=dict(zip(column_names, columns)), **kwargs) return cls_deserialize._from_data(col_accessor) @classmethod @@ -156,15 +152,11 @@ def _from_columns_like_self( return frame._copy_type_metadata(self, override_dtypes=override_dtypes) @_cudf_nvtx_annotate - def _mimic_inplace( - self, result: Self, inplace: bool = False - ) -> Optional[Self]: + def _mimic_inplace(self, result: Self, inplace: bool = False) -> Optional[Self]: if inplace: for col in self._data: if col in result._data: - self._data[col]._mimic_inplace( - result._data[col], inplace=True - ) + self._data[col]._mimic_inplace(result._data[col], inplace=True) self._data = result._data return None else: @@ -353,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._data.values(), other._data.values()) ) @_cudf_nvtx_annotate @@ -453,9 +443,7 @@ def get_column_values_na(col): ) dtype = find_common_type(dtypes) - matrix = make_empty_matrix( - shape=(len(self), ncol), dtype=dtype, order="F" - ) + matrix = make_empty_matrix(shape=(len(self), ncol), dtype=dtype, order="F") for i, col in enumerate(self._data.values()): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more @@ -496,9 +484,7 @@ def to_cupy( cupy.ndarray """ return self._to_array( - (lambda col: col.values.copy()) - if copy - else (lambda col: col.values), + (lambda col: col.values.copy()) if copy else (lambda col: col.values), cupy.empty, dtype, na_value, @@ -535,9 +521,7 @@ def to_numpy( "array always copies the data." 
) - return self._to_array( - (lambda col: col.values_host), np.empty, dtype, na_value - ) + return self._to_array((lambda col: col.values_host), np.empty, dtype, na_value) @_cudf_nvtx_annotate def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: @@ -728,9 +712,7 @@ def fillna( if method: if method not in {"ffill", "bfill", "pad", "backfill"}: - raise NotImplementedError( - f"Fill method {method} is not supported" - ) + raise NotImplementedError(f"Fill method {method} is not supported") if method == "pad": method = "ffill" elif method == "backfill": @@ -809,9 +791,7 @@ def _quantile_table( column_order = [libcudf.types.Order[key] for key in column_order] - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] + null_precedence = [libcudf.types.NullOrder[key] for key in null_precedence] return self._from_columns_like_self( libcudf.quantiles.quantile_table( @@ -912,17 +892,13 @@ def from_arrow(cls, data: pa.Table) -> Self: size=codes.size, ordered=dict_ordered[name], ) - for name, codes in zip( - dict_indices_table.column_names, indices_columns - ) + for name, codes in zip(dict_indices_table.column_names, indices_columns) } # Handle non-dict arrays cudf_non_category_frame = { name: col - for name, col in zip( - data.column_names, libcudf.interop.from_arrow(data) - ) + for name, col in zip(data.column_names, libcudf.interop.from_arrow(data)) } result = {**cudf_non_category_frame, **cudf_category_frame} @@ -930,19 +906,13 @@ def from_arrow(cls, data: pa.Table) -> Self: # There are some special cases that need to be handled # based on metadata. for name in result: - if ( - len(result[name]) == 0 - and pandas_dtypes.get(name) == "categorical" - ): + if len(result[name]) == 0 and pandas_dtypes.get(name) == "categorical": # When pandas_dtype is a categorical column and the size # of column is 0 (i.e., empty) then we will have an # int8 column in result._data[name] returned by libcudf, # which needs to be type-casted to 'category' dtype. result[name] = result[name].as_categorical_column("category") - elif ( - pandas_dtypes.get(name) == "empty" - and np_dtypes.get(name) == "object" - ): + elif pandas_dtypes.get(name) == "empty" and np_dtypes.get(name) == "object": # When a string column has all null values, pandas_dtype is # is specified as 'empty' and np_dtypes as 'object', # hence handling this special case to type-cast the empty @@ -1011,9 +981,7 @@ def _positions_from_column_names(self, column_names) -> list[int]: Frame. """ return [ - i - for i, name in enumerate(self._column_names) - if name in set(column_names) + i for i, name in enumerate(self._column_names) if name in set(column_names) ] @_cudf_nvtx_annotate @@ -1290,15 +1258,11 @@ def searchsorted( for col, val in zip(self._columns, values) ] sources = [ - col - if is_dtype_equal(col.dtype, common_dtype) - else col.astype(common_dtype) + col if is_dtype_equal(col.dtype, common_dtype) else col.astype(common_dtype) for col, common_dtype in zip(self._columns, common_dtype_list) ] values = [ - val - if is_dtype_equal(val.dtype, common_dtype) - else val.astype(common_dtype) + val if is_dtype_equal(val.dtype, common_dtype) else val.astype(common_dtype) for val, common_dtype in zip(values, common_dtype_list) ] @@ -1464,16 +1428,12 @@ def _is_sorted(self, ascending=None, null_position=None): Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. 
""" - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): + if ascending is not None and not cudf.api.types.is_list_like(ascending): raise TypeError( f"Expected a list-like or None for `ascending`, got " f"{type(ascending)}" ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): + if null_position is not None and not cudf.api.types.is_list_like(null_position): raise TypeError( f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" @@ -1489,9 +1449,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ - split_idx - ], + libcudf.copying.columns_split([*self._data.columns], splits)[split_idx], self._column_names, ) for split_idx in range(len(splits) + 1) @@ -1594,9 +1552,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @_cudf_nvtx_annotate @acquire_spill_lock() - def _apply_cupy_ufunc_to_operands( - self, ufunc, cupy_func, operands, **kwargs - ): + def _apply_cupy_ufunc_to_operands(self, ufunc, cupy_func, operands, **kwargs): # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1909,10 +1865,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - { - name: _apply_inverse_column(col) - for name, col in self._data.items() - } + {name: _apply_inverse_column(col) for name, col in self._data.items()} ) @_cudf_nvtx_annotate @@ -1932,19 +1885,14 @@ def nunique(self, dropna: bool = True): Name and unique value counts of each column in frame. """ return { - name: col.distinct_count(dropna=dropna) - for name, col in self._data.items() + name: col.distinct_count(dropna=dropna) for name, col in self._data.items() } @staticmethod @_cudf_nvtx_annotate - def _repeat( - columns: List[ColumnBase], repeats, axis=None - ) -> List[ColumnBase]: + def _repeat(columns: List[ColumnBase], repeats, axis=None) -> List[ColumnBase]: if axis is not None: - raise NotImplementedError( - "Only axis=`None` supported at this time." 
- ) + raise NotImplementedError("Only axis=`None` supported at this time.") if not is_scalar(repeats): repeats = as_column(repeats) @@ -1970,6 +1918,4 @@ def _apply_inverse_column(col: ColumnBase) -> ColumnBase: elif is_bool_dtype(col.dtype): return col.unary_operator("not") else: - raise TypeError( - f"Operation `~` not supported on {col.dtype.type.__name__}" - ) + raise TypeError(f"Operation `~` not supported on {col.dtype.type.__name__}") diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e5030eb634b..375b6320297 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -282,9 +282,10 @@ def __iter__(self): if isinstance(group_names, cudf.BaseIndex): group_names = group_names.to_pandas() for i, name in enumerate(group_names): - yield (name,) if isinstance(self._by, list) and len( - self._by - ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]] + yield ( + (name,) if isinstance(self._by, list) and len(self._by) == 1 else name, + grouped_values[offsets[i] : offsets[i + 1]], + ) @property def dtypes(self): @@ -320,10 +321,7 @@ def dtypes(self): index = self.grouping.keys.unique().sort_values().to_pandas() obj_dtypes = self.obj._dtypes return pd.DataFrame( - { - name: [obj_dtypes[name]] * len(index) - for name in self.obj._data.names - }, + {name: [obj_dtypes[name]] * len(index) for name in self.obj._data.names}, index=index, ) @@ -341,9 +339,7 @@ def groups(self): f"number of groups. Got {len(group_names)} groups." ) - return dict( - zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) - ) + return dict(zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1]))) @cached_property def indices(self): @@ -364,11 +360,7 @@ def indices(self): {10: array([0, 1]), 40: array([2])} """ offsets, group_keys, (indices,) = self._groupby.groups( - [ - cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype - ) - ] + [cudf.core.column.as_column(range(len(self.obj)), dtype=size_type_dtype)] ) group_keys = libcudf.stream_compaction.drop_duplicates(group_keys) @@ -377,9 +369,7 @@ def indices(self): else: (group_keys,) = group_keys index = cudf.Index(group_keys) - return dict( - zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) - ) + return dict(zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))) @_cudf_nvtx_annotate def get_group(self, name, obj=None): @@ -432,9 +422,7 @@ def size(self): """ return ( cudf.Series( - cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) + cudf.core.column.column_empty(len(self.obj), "int8", masked=False) ) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") @@ -447,9 +435,7 @@ def cumcount(self): """ return ( cudf.Series( - cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ), + cudf.core.column.column_empty(len(self.obj), "int8", masked=False), index=self.obj.index, ) .groupby(self.grouping, sort=self._sort) @@ -482,12 +468,9 @@ def rank( # treats NaNs the way we treat nulls. if cudf.get_option("mode.pandas_compatible"): if any( - is_float_dtype(typ) - for typ in self.grouping.values._dtypes.values() + is_float_dtype(typ) for typ in self.grouping.values._dtypes.values() ): - raise NotImplementedError( - "NaNs are not supported in groupby.rank." 
- ) + raise NotImplementedError("NaNs are not supported in groupby.rank.") def rank(x): return getattr(x, "rank")( @@ -507,9 +490,7 @@ def rank(x): @cached_property def _groupby(self): - return libgroupby.GroupBy( - [*self.grouping.keys._columns], dropna=self._dropna - ) + return libgroupby.GroupBy([*self.grouping.keys._columns], dropna=self._dropna) @_cudf_nvtx_annotate def agg(self, func): @@ -632,10 +613,7 @@ def agg(self, func): key = (col_name, agg_name) else: key = col_name - if ( - agg in {list, "collect"} - and orig_dtype != col.dtype.element_type - ): + if agg in {list, "collect"} and orig_dtype != col.dtype.element_type: # Structs lose their labels which we reconstruct here col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) @@ -673,9 +651,7 @@ def agg(self, func): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) + left_cols = list(self.grouping.keys.drop_duplicates()._data.columns) right_cols = list(result_index._data.columns) join_keys = [ _match_join_keys(lcol, rcol, "left") @@ -729,13 +705,9 @@ def _reduce( The numeric_only, min_count """ if numeric_only: - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) + raise NotImplementedError("numeric_only parameter is not implemented yet") if min_count != 0: - raise NotImplementedError( - "min_count parameter is not implemented yet" - ) + raise NotImplementedError("min_count parameter is not implemented yet") return self.agg(op) def _scan(self, op: str, *args, **kwargs): @@ -778,9 +750,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) if n < 0: - size_per_group = np.maximum( - size_per_group + n, 0, out=size_per_group - ) + size_per_group = np.maximum(size_per_group + n, 0, out=size_per_group) else: size_per_group = np.minimum(size_per_group, n, out=size_per_group) if take_head: @@ -857,9 +827,7 @@ def head(self, n: int = 5, *, preserve_order: bool = True): 6 3 6 8 3 8 """ - return self._head_tail( - n, take_head=True, preserve_order=preserve_order - ) + return self._head_tail(n, take_head=True, preserve_order=preserve_order) @_cudf_nvtx_annotate def tail(self, n: int = 5, *, preserve_order: bool = True): @@ -911,9 +879,7 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): 9 3 9 10 3 10 """ - return self._head_tail( - n, take_head=False, preserve_order=preserve_order - ) + return self._head_tail(n, take_head=False, preserve_order=preserve_order) @_cudf_nvtx_annotate def nth(self, n): @@ -929,9 +895,7 @@ def nth(self, n): result = result[sizes > n] - result._index = self.obj.index.take( - result._data["__groupbynth_order__"] - ) + result._index = self.obj.index.take(result._data["__groupbynth_order__"]) del result._data["__groupbynth_order__"] del self.obj._data["__groupbynth_order__"] return result @@ -1082,8 +1046,7 @@ def sample( # TODO: handle random states properly. 
if random_state is not None and not isinstance(random_state, int): raise NotImplementedError( - "Only integer seeds are supported for random_state " - "in this case" + "Only integer seeds are supported for random_state " "in this case" ) # Get the groups # TODO: convince Cython to convert the std::vector offsets @@ -1105,9 +1068,9 @@ def sample( # Pandas uses round-to-nearest, ties to even to # pick sample sizes for the fractional case (unlike IEEE # which is round-to-nearest, ties to sgn(x) * inf). - samples_per_group = np.round( - size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + samples_per_group = np.round(size_per_group * frac, decimals=0).astype( + size_type_dtype + ) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1178,9 +1141,7 @@ def deserialize(cls, header, frames): kwargs = header["kwargs"] obj_type = pickle.loads(header["obj_type"]) - obj = obj_type.deserialize( - header["obj"], frames[: header["num_obj_frames"]] - ) + obj = obj_type.deserialize(header["obj"], frames[: header["num_obj_frames"]]) grouping = _Grouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) @@ -1305,9 +1266,7 @@ def pipe(self, func, *args, **kwargs): def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): - chunk_results = jit_groupby_apply( - offsets, grouped_values, function, *args - ) + chunk_results = jit_groupby_apply(offsets, grouped_values, function, *args) return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values ) @@ -1325,9 +1284,7 @@ def _iterative_groupby_apply( RuntimeWarning, ) - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] + chunks = [grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])] chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values @@ -1373,9 +1330,7 @@ def _post_process_chunk_results( # TODO: Is there a better way to determine what # the column name should be, especially if we applied # a nameless UDF. - result = result.to_frame( - name=grouped_values._data.names[0] - ) + result = result.to_frame(name=grouped_values._data.names[0]) else: index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column @@ -1401,9 +1356,7 @@ def _post_process_chunk_results( return result @_cudf_nvtx_annotate - def apply( - self, function, *args, engine="auto", include_groups: bool = True - ): + def apply(self, function, *args, engine="auto", include_groups: bool = True): """Apply a python transformation function over the grouped chunk. 
Parameters @@ -1917,13 +1870,9 @@ def corr(self, method="pearson", min_periods=1): """ if method.lower() not in ("pearson",): - raise NotImplementedError( - "Only pearson correlation is currently supported" - ) + raise NotImplementedError("Only pearson correlation is currently supported") - return self._cov_or_corr( - lambda x: x.corr(method, min_periods), "Correlation" - ) + return self._cov_or_corr(lambda x: x.corr(method, min_periods), "Correlation") @_cudf_nvtx_annotate def cov(self, min_periods=0, ddof=1): @@ -2011,9 +1960,7 @@ def cov(self, min_periods=0, ddof=1): val3 3.833333 12.333333 12.333333 """ - return self._cov_or_corr( - lambda x: x.cov(min_periods, ddof), "Covariance" - ) + return self._cov_or_corr(lambda x: x.cov(min_periods, ddof), "Covariance") def _cov_or_corr(self, func, method_name): """ @@ -2057,17 +2004,15 @@ def _cov_or_corr(self, func, method_name): offset=0, ) - column_pair_groupby = cudf.DataFrame._from_data( - column_pair_structs - ).groupby(by=self.grouping.keys) + column_pair_groupby = cudf.DataFrame._from_data(column_pair_structs).groupby( + by=self.grouping.keys + ) try: gb_cov_corr = column_pair_groupby.agg(func) except RuntimeError as e: if "Unsupported groupby reduction type-agg combination" in str(e): - raise TypeError( - f"{method_name} accepts only numerical column-pairs" - ) + raise TypeError(f"{method_name} accepts only numerical column-pairs") raise # ensure that column-pair labels are arranged in ascending order @@ -2077,8 +2022,7 @@ def _cov_or_corr(self, func, method_name): for i, x in enumerate(column_names) ] cols_split = [ - cols_list[i : i + num_cols] - for i in range(0, len(cols_list), num_cols) + cols_list[i : i + num_cols] for i in range(0, len(cols_list), num_cols) ] # interleave: combines the correlation or covariance results for each @@ -2295,9 +2239,7 @@ def fillna( values = self.obj.__class__._from_data( self.grouping.values._data, self.obj.index ) - return values.fillna( - value=value, inplace=inplace, axis=axis, limit=limit - ) + return values.fillna(value=value, inplace=inplace, axis=axis, limit=limit) @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): @@ -2343,9 +2285,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): values = self.grouping.values if is_list_like(fill_value): if len(fill_value) != len(values._data): - raise ValueError( - "Mismatched number of columns and values to fill." - ) + raise ValueError("Mismatched number of columns and values to fill.") else: fill_value = [fill_value] * len(values._data) @@ -2353,9 +2293,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): dict( zip( values._column_names, - self._groupby.shift( - [*values._columns], periods, fill_value - )[0], + self._groupby.shift([*values._columns], periods, fill_value)[0], ) ) ) @@ -2407,15 +2345,11 @@ def pct_change( if freq is not None: raise NotImplementedError("freq parameter not supported yet.") elif fill_method not in {no_default, None, "ffill", "bfill"}: - raise ValueError( - "fill_method must be one of 'ffill', or" "'bfill'." - ) + raise ValueError("fill_method must be one of 'ffill', or" "'bfill'.") if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." 
warnings.warn( "The 'fill_method' keyword being not None and the 'limit' " f"keywords in {type(self).__name__}.pct_change are " @@ -2572,9 +2506,7 @@ def value_counts( df["__placeholder"] = 1 result = ( - df.groupby(groupings + list(subset), dropna=dropna)[ - "__placeholder" - ] + df.groupby(groupings + list(subset), dropna=dropna)["__placeholder"] .count() .sort_index() .astype(np.int64) @@ -2593,9 +2525,7 @@ def value_counts( if not self._as_index: if name in df._column_names: - raise ValueError( - f"Column label '{name}' is duplicate of result column" - ) + raise ValueError(f"Column label '{name}' is duplicate of result column") result.name = name result = result.to_frame().reset_index() else: @@ -2603,9 +2533,7 @@ def value_counts( return result - def _mimic_pandas_order( - self, result: DataFrameOrSeries - ) -> DataFrameOrSeries: + def _mimic_pandas_order(self, result: DataFrameOrSeries) -> DataFrameOrSeries: """Given a groupby result from libcudf, reconstruct the row orders matching that of pandas. This also adds appropriate indices. """ @@ -2618,12 +2546,9 @@ def _mimic_pandas_order( # result coming back from libcudf has null_count few rows than # the input, so we must produce an ordering from the full # input range. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) + _, _, (ordering,) = self._groupby.groups([as_column(range(0, len(self.obj)))]) if self._dropna and any( - c.has_nulls(include_nan=True) > 0 - for c in self.grouping._key_columns + c.has_nulls(include_nan=True) > 0 for c in self.grouping._key_columns ): # Scan aggregations with null/nan keys put nulls in the # corresponding output rows in pandas, to do that here @@ -2691,9 +2616,7 @@ def apply(self, func, *args): # TODO: should we define this as a dataclass instead? 
class Grouper: - def __init__( - self, key=None, level=None, freq=None, closed=None, label=None - ): + def __init__(self, key=None, level=None, freq=None, closed=None, label=None): if key is not None and level is not None: raise ValueError("Grouper cannot specify both key and level") if (key, level) == (None, None) and not freq: @@ -2762,9 +2685,7 @@ def keys(self): dict(zip(range(nkeys), self._key_columns)) )._set_names(self.names) else: - return cudf.core.index.as_index( - self._key_columns[0], name=self.names[0] - ) + return cudf.core.index.as_index(self._key_columns[0], name=self.names[0]) @property def values(self) -> cudf.core.frame.Frame: @@ -2851,9 +2772,7 @@ def serialize(self): def deserialize(cls, header, frames): names = pickle.loads(header["names"]) _named_columns = pickle.loads(header["_named_columns"]) - key_columns = cudf.core.column.deserialize_columns( - header["columns"], frames - ) + key_columns = cudf.core.column.deserialize_columns(header["columns"], frames) out = _Grouping.__new__(_Grouping) out.names = names out._named_columns = _named_columns diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bd9dc1ae3da..1d0efc3ad47 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -141,8 +141,7 @@ def _index_from_data(data: MutableMapping, name: Any = no_default): index_class_type = IntervalIndex else: raise NotImplementedError( - "Unsupported column type passed to " - f"create an Index: {type(values)}" + "Unsupported column type passed to " f"create an Index: {type(values)}" ) else: index_class_type = cudf.MultiIndex @@ -201,9 +200,7 @@ class RangeIndex(BaseIndex, BinaryOperand): _range: range @_cudf_nvtx_annotate - def __init__( - self, start, stop=None, step=1, dtype=None, copy=False, name=None - ): + def __init__(self, start, stop=None, step=1, dtype=None, copy=False, name=None): if step == 0: raise ValueError("Step must not be zero.") if not cudf.api.types.is_hashable(name): @@ -219,20 +216,14 @@ def __init__( if stop is None: start, stop = 0, start if not is_integer(start): - raise TypeError( - f"start must be an integer, not {type(start).__name__}" - ) + raise TypeError(f"start must be an integer, not {type(start).__name__}") self._start = int(start) if not is_integer(stop): - raise TypeError( - f"stop must be an integer, not {type(stop).__name__}" - ) + raise TypeError(f"stop must be an integer, not {type(stop).__name__}") self._stop = int(stop) if step is not None: if not is_integer(step): - raise TypeError( - f"step must be an integer, not {type(step).__name__}" - ) + raise TypeError(f"step must be an integer, not {type(step).__name__}") self._step = int(step) else: self._step = 1 @@ -243,9 +234,7 @@ def __init__( # whereas _stop is an upper bound. self._end = self._start + self._step * (len(self._range) - 1) - def _copy_type_metadata( - self, other: RangeIndex, *, override_dtypes=None - ) -> Self: + def _copy_type_metadata(self, other: RangeIndex, *, override_dtypes=None) -> Self: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. 
return self @@ -341,9 +330,7 @@ def hasnans(self): @property # type: ignore @_cudf_nvtx_annotate def _data(self): - return cudf.core.column_accessor.ColumnAccessor( - {self.name: self._values} - ) + return cudf.core.column_accessor.ColumnAccessor({self.name: self._values}) @_cudf_nvtx_annotate def __contains__(self, item): @@ -541,9 +528,7 @@ def __mul__(self, other): ): other = other.item() if isinstance(other, (int, np.integer)): - return RangeIndex( - self.start * other, self.stop * other, self.step * other - ) + return RangeIndex(self.start * other, self.stop * other, self.step * other) return self._as_int_index().__mul__(other) @_cudf_nvtx_annotate @@ -559,9 +544,7 @@ def _as_int_index(self): @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - return self._as_int_index().__array_ufunc__( - ufunc, method, *inputs, **kwargs - ) + return self._as_int_index().__array_ufunc__(ufunc, method, *inputs, **kwargs) @_cudf_nvtx_annotate def get_indexer(self, target, limit=None, method=None, tolerance=None): @@ -716,9 +699,7 @@ def _intersection(self, other, sort=None): # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = ( - first.start + (second.start - first.start) * first.step // gcd * s - ) + tmp_start = first.start + (second.start - first.start) * first.step // gcd * s new_step = first.step * second.step // gcd no_steps = -(-(int_low - tmp_start) // abs(new_step)) new_start = tmp_start + abs(new_step) * no_steps @@ -737,9 +718,7 @@ def difference(self, other, sort=None): if isinstance(other, RangeIndex) and self.equals(other): return self[:0]._get_reconciled_name_object(other) - return self._try_reconstruct_range_index( - super().difference(other, sort=sort) - ) + return self._try_reconstruct_range_index(super().difference(other, sort=sort)) def _try_reconstruct_range_index(self, index): if isinstance(index, RangeIndex) or index.dtype.kind == "f": @@ -801,18 +780,14 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return cudf.Index._from_data( - {self.name: self._as_int_index()._split(splits)} - ) + return cudf.Index._from_data({self.name: self._as_int_index()._split(splits)}) def _binaryop(self, other, op: str): # TODO: certain binops don't require materializing range index and # could use some optimization. 
return self._as_int_index()._binaryop(other, op=op) - def join( - self, other, how="left", level=None, return_indexers=False, sort=False - ): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): if how in {"left", "right"} or self.equals(other): # pandas supports directly merging RangeIndex objects and can # intelligently create RangeIndex outputs depending on the type of @@ -827,14 +802,10 @@ def join( sort=sort, ) if return_indexers: - return tuple( - cudf.from_pandas(result[0]), result[1], result[2] - ) + return tuple(cudf.from_pandas(result[0]), result[1], result[2]) else: return cudf.from_pandas(result) - return self._as_int_index().join( - other, how, level, return_indexers, sort - ) + return self._as_int_index().join(other, how, level, return_indexers, sort) @property # type: ignore @_cudf_nvtx_annotate @@ -861,9 +832,7 @@ def argsort( raise ValueError(f"invalid na_position: {na_position}") indices = cupy.arange(0, len(self)) - if (ascending and self._step < 0) or ( - not ascending and self._step > 0 - ): + if (ascending and self._step < 0) or (not ascending and self._step > 0): indices = indices[::-1] return indices @@ -908,9 +877,7 @@ def _minmax(self, meth: str): no_steps = len(self) - 1 if no_steps == -1: return np.nan - elif (meth == "min" and self.step > 0) or ( - meth == "max" and self.step < 0 - ): + elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0): return self.start return self.start + self.step * no_steps @@ -1007,8 +974,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): inputs = self._make_operands_for_binop(other) else: inputs = { - name: (col, None, False, None) - for name, col in self._data.items() + name: (col, None, False, None) for name, col in self._data.items() } data = self._apply_cupy_ufunc_to_operands( @@ -1075,9 +1041,7 @@ def _binaryop( other_name = getattr(other, "name", self.name) ret.name = ( - self.name - if cudf.utils.utils._is_same_name(self.name, other_name) - else None + self.name if cudf.utils.utils._is_same_name(self.name, other_name) else None ) # pandas returns numpy arrays when the outputs are boolean. We @@ -1092,12 +1056,8 @@ def _binaryop( # Override just to make mypy happy. @_cudf_nvtx_annotate - def _copy_type_metadata( - self, other: Self, *, override_dtypes=None - ) -> Self: - return super()._copy_type_metadata( - other, override_dtypes=override_dtypes - ) + def _copy_type_metadata(self, other: Self, *, override_dtypes=None) -> Self: + return super()._copy_type_metadata(other, override_dtypes=override_dtypes) @property # type: ignore @_cudf_nvtx_annotate @@ -1110,9 +1070,7 @@ def _concat(cls, objs): non_empties = [index for index in objs if len(index)] if len(objs) != len(non_empties): # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. 
In a future version, this will no longer exclude " @@ -1147,11 +1105,7 @@ def is_unique(self): @_cudf_nvtx_annotate def equals(self, other): - if ( - other is None - or not isinstance(other, BaseIndex) - or len(self) != len(other) - ): + if other is None or not isinstance(other, BaseIndex) or len(self) != len(other): return False check_dtypes = False @@ -1166,9 +1120,7 @@ def equals(self, other): check_dtypes = True try: - return self._column.equals( - other._column, check_dtypes=check_dtypes - ) + return self._column.equals(other._column, check_dtypes=check_dtypes) except TypeError: return False @@ -1220,9 +1172,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if not self.is_unique: raise ValueError("Cannot get index for a non-unique Index.") - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) + is_sorted = self.is_monotonic_increasing or self.is_monotonic_decreasing if not is_sorted and method is not None: raise ValueError( @@ -1276,9 +1226,7 @@ def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) + is_sorted = self.is_monotonic_increasing or self.is_monotonic_decreasing target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( @@ -1290,11 +1238,7 @@ def get_loc(self, key): if lower_bound + 1 == upper_bound: # Search result is unique, return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) - ) + return lower_bound if is_sorted else sort_inds.element_indexing(lower_bound) if is_sorted: # In monotonic index, lex search result is continuous. A slice for @@ -1342,9 +1286,7 @@ def __repr__(self): ) ) break_idx = output.find("ordered=") - output = ( - output[:break_idx].replace("'", "") + output[break_idx:] - ) + output = output[:break_idx].replace("'", "") + output[break_idx:] else: output = repr(preprocess.to_pandas()) @@ -1593,9 +1535,7 @@ def str(self): if isinstance(self._values, cudf.core.column.StringColumn): return StringMethods(parent=self) else: - raise AttributeError( - "Can only use .str accessor with string values!" - ) + raise AttributeError("Can only use .str accessor with string values!") @cache @_warn_no_dask_cudf @@ -1738,9 +1678,7 @@ def _copy_type_metadata( return self @classmethod - def _from_data( - cls, data: MutableMapping, name: Any = no_default, freq: Any = None - ): + def _from_data(cls, data: MutableMapping, name: Any = no_default, freq: Any = None): result = super()._from_data(data, name) result._freq = _validate_freq(freq) return result @@ -2105,9 +2043,7 @@ def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") @@ -2293,9 +2229,7 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): result_col = delocalize(self._column) else: result_col = localize(self._column, tz, ambiguous, nonexistent) - return DatetimeIndex._from_data( - {self.name: result_col}, freq=self._freq - ) + return DatetimeIndex._from_data({self.name: result_col}, freq=self._freq) def tz_convert(self, tz): """ @@ -2445,9 +2379,7 @@ def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.TimedeltaIndex: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) + raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") @@ -2464,9 +2396,7 @@ def days(self): Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return as_index( - arbitrary=self._values.days, name=self.name, dtype="int64" - ) + return as_index(arbitrary=self._values.days, name=self.name, dtype="int64") @property # type: ignore @_cudf_nvtx_annotate @@ -2474,9 +2404,7 @@ def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index( - arbitrary=self._values.seconds, name=self.name, dtype="int32" - ) + return as_index(arbitrary=self._values.seconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2588,8 +2516,7 @@ def __init__( if isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): if categories is not None or ordered is not None: raise ValueError( - "Cannot specify `categories` or " - "`ordered` together with `dtype`." + "Cannot specify `categories` or " "`ordered` together with `dtype`." ) if copy: data = column.as_column(data, dtype=dtype).copy(deep=True) @@ -2613,9 +2540,7 @@ def __init__( ordered=data.ordered, ) else: - data = column.as_column( - data, dtype="category" if dtype is None else dtype - ) + data = column.as_column(data, dtype="category" if dtype is None else dtype) # dtype has already been taken care dtype = None @@ -2740,8 +2665,7 @@ def interval_range( end = start + periods * freq if any( - not _is_non_decimal_numeric_dtype(x.dtype) - for x in (start, periods, freq, end) + not _is_non_decimal_numeric_dtype(x.dtype) for x in (start, periods, freq, end) ): raise ValueError("start, end, periods, freq must be numeric values.") @@ -2867,9 +2791,7 @@ def closed(self): def from_breaks( cls, breaks, - closed: Optional[ - Literal["left", "right", "neither", "both"] - ] = "right", + closed: Optional[Literal["left", "right", "neither", "both"]] = "right", name=None, copy: bool = False, dtype=None, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 94d862d52b4..b99fed499bb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -354,9 +354,7 @@ def __round__(self, digits=0): # this method. 
return self.round(decimals=digits) - def _mimic_inplace( - self, result: Self, inplace: bool = False - ) -> Optional[Self]: + def _mimic_inplace(self, result: Self, inplace: bool = False) -> Optional[Self]: if inplace: self._index = result._index return super()._mimic_inplace(result, inplace) @@ -512,9 +510,7 @@ def empty(self): def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" - return cudf.io.json.to_json( - self, path_or_buf=path_or_buf, *args, **kwargs - ) + return cudf.io.json.to_json(self, path_or_buf=path_or_buf, *args, **kwargs) @_cudf_nvtx_annotate @ioutils.doc_to_hdf() @@ -1013,11 +1009,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): if self.ndim == 1: # In case of series and Index, # swap lower and upper if lower > upper - if ( - lower[0] is not None - and upper[0] is not None - and (lower[0] > upper[0]) - ): + if lower[0] is not None and upper[0] is not None and (lower[0] > upper[0]): lower[0], upper[0] = upper[0], lower[0] data = { @@ -1112,12 +1104,8 @@ def dot(self, other, reflect=False): elif isinstance(self, cudf.DataFrame) and isinstance( other, (cudf.Series, cudf.DataFrame) ): - common = self._data.to_pandas_index().union( - other.index.to_pandas() - ) - if len(common) > len(self._data.names) or len(common) > len( - other.index - ): + common = self._data.to_pandas_index().union(other.index.to_pandas()) + if len(common) > len(self._data.names) or len(common) > len(other.index): raise ValueError("matrices are not aligned") lhs = self.reindex(columns=common, copy=False) @@ -1493,9 +1481,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def median(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): """ Return the median of the values for the requested axis. @@ -1848,9 +1834,7 @@ def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: @_cudf_nvtx_annotate @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): + def rolling(self, window, min_periods=None, center=False, axis=0, win_type=None): return Rolling( self, window, @@ -1927,11 +1911,7 @@ def _copy_type_metadata( See `ColumnBase._with_type_metadata` for more information. """ super()._copy_type_metadata(other, override_dtypes=override_dtypes) - if ( - include_index - and self._index is not None - and other._index is not None - ): + if include_index and self._index is not None and other._index is not None: self._index._copy_type_metadata(other._index) # When other._index is a CategoricalIndex, the current index # will be a NumericalIndex with an underlying CategoricalColumn @@ -1940,9 +1920,7 @@ def _copy_type_metadata( # appropriate index. 
if isinstance( other._index, cudf.core.index.CategoricalIndex - ) and not isinstance( - self._index, cudf.core.index.CategoricalIndex - ): + ) and not isinstance(self._index, cudf.core.index.CategoricalIndex): self._index = cudf.Index( cast("cudf.Index", self._index)._column, name=self._index.name, @@ -2060,9 +2038,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if freq is not None: raise ValueError("The freq argument is not yet supported.") - data_columns = ( - col.shift(periods, fill_value) for col in self._columns - ) + data_columns = (col.shift(periods, fill_value) for col in self._columns) return self.__class__._from_data( zip(self._column_names, data_columns), self._index ) @@ -2709,9 +2685,7 @@ def sort_index( by=by, ascending=ascending, na_position=na_position ) out = self._gather( - GatherMap.from_column_unchecked( - inds, len(self), nullify=False - ) + GatherMap.from_column_unchecked(inds, len(self), nullify=False) ) # TODO: frame factory function should handle multilevel column # names @@ -2725,9 +2699,7 @@ def sort_index( ): out = self.copy() else: - inds = idx.argsort( - ascending=ascending, na_position=na_position - ) + inds = idx.argsort(ascending=ascending, na_position=na_position) out = self._gather( GatherMap.from_column_unchecked( cudf.core.column.as_column(inds), @@ -2948,16 +2920,12 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: has_range_index = isinstance(index, RangeIndex) if len(range(start, stop, stride)) == 0: # Avoid materialising the range index column - result = self._empty_like( - keep_index=keep_index and not has_range_index - ) + result = self._empty_like(keep_index=keep_index and not has_range_index) if keep_index and has_range_index: lo = index.start + start * index.step hi = index.start + stop * index.step step = index.step * stride - result.index = RangeIndex( - start=lo, stop=hi, step=step, name=index.name - ) + result.index = RangeIndex(start=lo, stop=hi, step=step, name=index.name) return result if start < 0: start = start + num_rows @@ -2992,11 +2960,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: ) columns_to_slice = [ - *( - self._index._data.columns - if keep_index and not has_range_index - else [] - ), + *(self._index._data.columns if keep_index and not has_range_index else []), *self._columns, ] result = self._from_columns_like_self( @@ -3011,18 +2975,14 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: result.index = self.index[start:stop] return result - def _positions_from_column_names( - self, column_names, offset_by_index_columns=False - ): + def _positions_from_column_names(self, column_names, offset_by_index_columns=False): """Map each column name into their positions in the frame. Return positions of the provided column names, offset by the number of index columns if `offset_by_index_columns` is True. The order of indices returned corresponds to the column order in this Frame. 
""" - num_index_columns = ( - len(self._index._data) if offset_by_index_columns else 0 - ) + num_index_columns = len(self._index._data) if offset_by_index_columns else 0 return [ i + num_index_columns for i, name in enumerate(self._column_names) @@ -3051,8 +3011,7 @@ def drop_duplicates( """ if not isinstance(ignore_index, (np.bool_, bool)): raise ValueError( - f"{ignore_index=} must be bool, " - f"not {type(ignore_index).__name__}" + f"{ignore_index=} must be bool, " f"not {type(ignore_index).__name__}" ) subset = self._preprocess_subset(subset) subset_cols = [name for name in self._column_names if name in subset] @@ -3174,9 +3133,7 @@ def duplicated(self, subset=None, keep="first"): columns = [self._column] else: columns = [self._data[n] for n in subset] - distinct = libcudf.stream_compaction.distinct_indices( - columns, keep=keep - ) + distinct = libcudf.stream_compaction.distinct_indices(columns, keep=keep) (result,) = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, @@ -3223,14 +3180,10 @@ def _split(self, splits, keep_index=True): ] @_cudf_nvtx_annotate - def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): # noqa: D102 + def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): # noqa: D102 if method is not None: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( f"{type(self).__name__}.fillna with 'method' is " "deprecated and will raise in a future version. " @@ -3453,9 +3406,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): self, func, args, kernel_getter=kernel_getter ) except Exception as e: - raise ValueError( - "user defined function compilation failed." - ) from e + raise ValueError("user defined function compilation failed.") from e # Mask and data column preallocated ans_col = _return_arr_from_dtype(retty, len(self)) @@ -3554,10 +3505,7 @@ def sort_values( ), keep_index=not ignore_index, ) - if ( - isinstance(self, cudf.core.dataframe.DataFrame) - and self._data.multiindex - ): + if isinstance(self, cudf.core.dataframe.DataFrame) and self._data.multiindex: out.columns = self._data.to_pandas_index() return out @@ -3609,13 +3557,9 @@ def _n_largest_or_smallest( # Empty slice. 
indices = indices.slice(0, 0) else: - indices = indices.slice( - *slice(None, -n - 1, -1).indices(len(self)) - ) + indices = indices.slice(*slice(None, -n - 1, -1).indices(len(self))) return self._gather( - GatherMap.from_column_unchecked( - indices, len(self), nullify=False - ), + GatherMap.from_column_unchecked(indices, len(self), nullify=False), keep_index=True, ) else: @@ -3652,9 +3596,7 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - result = self.__class__._from_data( - data=result._data, index=result.index - ) + result = self.__class__._from_data(data=result._data, index=result.index) result._data.multiindex = self._data.multiindex result._data._level_names = self._data._level_names result.index.names = self.index.names @@ -3700,9 +3642,7 @@ def _reindex( df = self if index is not None: if not df._index.is_unique: - raise ValueError( - "cannot reindex on an axis with duplicate labels" - ) + raise ValueError("cannot reindex on an axis with duplicate labels") index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) @@ -3717,9 +3657,7 @@ def _reindex( if not idx_dtype_match: column_names = ( - column_names - if column_names is not None - else list(df._column_names) + column_names if column_names is not None else list(df._column_names) ) df = cudf.DataFrame() else: @@ -3727,9 +3665,7 @@ def _reindex( rhs = cudf.DataFrame._from_data( { # bookkeeping workaround for unnamed series - (name or 0) - if isinstance(self, cudf.Series) - else name: col + (name or 0) if isinstance(self, cudf.Series) else name: col for name, col in df._data.items() }, index=df._index, @@ -3758,9 +3694,7 @@ def _reindex( names = column_names if isinstance(names, cudf.Index): names = names.to_pandas() - rangeindex = isinstance( - column_names, (pd.RangeIndex, cudf.RangeIndex) - ) + rangeindex = isinstance(column_names, (pd.RangeIndex, cudf.RangeIndex)) level_names = tuple(column_names.names) else: names = column_names @@ -3887,9 +3821,7 @@ def round(self, decimals=0, how="half_even"): elif isinstance(decimals, int): decimals = {name: decimals for name in self._column_names} elif not isinstance(decimals, abc.Mapping): - raise TypeError( - "decimals must be an integer, a dict-like or a Series" - ) + raise TypeError("decimals must be an integer, a dict-like or a Series") cols = { name: col.round(decimals[name], how=how) @@ -4082,18 +4014,14 @@ def resample( "- origin\n" "- offset" ) - by = cudf.Grouper( - key=on, freq=rule, closed=closed, label=label, level=level - ) + by = cudf.Grouper(key=on, freq=rule, closed=closed, label=label, level=level) return ( cudf.core.resample.SeriesResampler(self, by=by) if isinstance(self, cudf.Series) else cudf.core.resample.DataFrameResampler(self, by=by) ) - def dropna( - self, axis=0, how="any", thresh=None, subset=None, inplace=False - ): + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """ Drop rows (or columns) containing nulls from a Column. 
@@ -4193,9 +4121,7 @@ def dropna( if axis == 0: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) else: - result = self._drop_na_columns( - how=how, subset=subset, thresh=thresh - ) + result = self._drop_na_columns(how=how, subset=subset, thresh=thresh) return self._mimic_inplace(result, inplace=inplace) @@ -4222,9 +4148,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): check_col = col.nans_to_nulls() except AttributeError: check_col = col - no_threshold_valid_count = ( - len(col) - check_col.null_count - ) < thresh + no_threshold_valid_count = (len(col) - check_col.null_count) < thresh if no_threshold_valid_count: continue out_cols.append(name) @@ -4397,9 +4321,7 @@ def _first_or_last( if not isinstance(self._index, cudf.core.index.DatetimeIndex): raise TypeError("'first' only supports a DatetimeIndex index.") if not isinstance(offset, str): - raise NotImplementedError( - f"Unsupported offset type {type(offset)}." - ) + raise NotImplementedError(f"Unsupported offset type {type(offset)}.") if len(self) == 0: return self.copy() @@ -4419,9 +4341,7 @@ def _first_or_last( return self.loc[:to_search] needle = as_column(to_search, dtype=self._index.dtype) end_point = int( - self._index._column.searchsorted( - needle, side=side - ).element_indexing(0) + self._index._column.searchsorted(needle, side=side).element_indexing(0) ) return slice_func(end_point) @@ -4643,15 +4563,11 @@ def sample( "population `frac` > 1." ) if n is not None: - raise ValueError( - "Please enter a value for `frac` OR `n`, not both." - ) + raise ValueError("Please enter a value for `frac` OR `n`, not both.") n = int(round(size * frac)) if n > 0 and size == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty." - ) + raise ValueError("Cannot take a sample larger than 0 when axis is empty.") if isinstance(random_state, cp.random.RandomState): lib = cp @@ -4681,18 +4597,14 @@ def sample( weights = weights / weights.sum() if axis == 0: - return self._sample_axis_0( - n, weights, replace, random_state, ignore_index - ) + return self._sample_axis_0(n, weights, replace, random_state, ignore_index) else: if isinstance(random_state, cp.random.RandomState): raise ValueError( "Sampling from `axis=1`/`columns` with cupy random state" "isn't supported." 
) - return self._sample_axis_1( - n, weights, replace, random_state, ignore_index - ) + return self._sample_axis_1(n, weights, replace, random_state, ignore_index) def _sample_axis_0( self, @@ -4705,9 +4617,7 @@ def _sample_axis_0( try: gather_map = GatherMap.from_column_unchecked( cudf.core.column.as_column( - random_state.choice( - len(self), size=n, replace=replace, p=weights - ) + random_state.choice(len(self), size=n, replace=replace, p=weights) ), len(self), nullify=False, @@ -4751,9 +4661,7 @@ def _binaryop( if operands is NotImplemented: return NotImplemented - level_names = ( - self._data._level_names if can_use_self_column_name else None - ) + level_names = self._data._level_names if can_use_self_column_name else None return self._from_data( ColumnAccessor( type(self)._colwise_binop(operands, op), @@ -4793,14 +4701,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if cupy_func: if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index, _ = self._make_operands_and_index_for_binop( - other, fname - ) + inputs, index, _ = self._make_operands_and_index_for_binop(other, fname) else: # This works for Index too inputs = { - name: (col, None, False, None) - for name, col in self._data.items() + name: (col, None, False, None) for name, col in self._data.items() } index = self._index @@ -4879,9 +4784,7 @@ def repeat(self, repeats, axis=None): dtype: int64 """ return self._from_columns_like_self( - Frame._repeat( - [*self._index._data.columns, *self._columns], repeats, axis - ), + Frame._repeat([*self._index._data.columns, *self._columns], repeats, axis), self._column_names, self._index_names, ) @@ -5185,9 +5088,7 @@ def drop( """ if labels is not None: if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) + raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") target = labels elif index is not None: target = index @@ -5197,8 +5098,7 @@ def drop( axis = 1 else: raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" + "Need to specify at least one of 'labels', " "'index' or 'columns'" ) if inplace: @@ -5249,9 +5149,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): ) # We must copy inner datatype of the exploded list column to # maintain struct dtype key names - exploded_dtype = cast( - ListDtype, self._columns[column_index].dtype - ).element_type + exploded_dtype = cast(ListDtype, self._columns[column_index].dtype).element_type return self._from_columns_like_self( exploded, self._column_names, @@ -5288,9 +5186,7 @@ def tile(self, count): The indexed frame containing the tiled "rows". """ return self._from_columns_like_self( - libcudf.reshape.tile( - [*self._index._columns, *self._columns], count - ), + libcudf.reshape.tile([*self._index._columns, *self._columns], count), column_names=self._column_names, index_names=self._index_names, ) @@ -5315,19 +5211,13 @@ def groupby( raise NotImplementedError("axis parameter is not yet implemented") if squeeze is not False: - raise NotImplementedError( - "squeeze parameter is not yet implemented" - ) + raise NotImplementedError("squeeze parameter is not yet implemented") if not observed: - raise NotImplementedError( - "observed parameter is not yet implemented" - ) + raise NotImplementedError("observed parameter is not yet implemented") if by is None and level is None: - raise TypeError( - "groupby() requires either by or level to be specified." 
- ) + raise TypeError("groupby() requires either by or level to be specified.") if group_keys is None: group_keys = False @@ -5835,9 +5725,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rfloordiv( - self, other, axis, level=None, fill_value=None - ): # noqa: D102 + def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5967,9 +5855,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def eq( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) @@ -6009,9 +5895,7 @@ def eq( ), ) ) - def ne( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) @@ -6051,9 +5935,7 @@ def ne( ), ) ) - def lt( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) @@ -6093,9 +5975,7 @@ def lt( ), ) ) - def le( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__le__", fill_value=fill_value, can_reindex=True ) @@ -6135,9 +6015,7 @@ def le( ), ) ) - def gt( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) @@ -6177,9 +6055,7 @@ def gt( ), ) ) - def ge( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: D102 + def ge(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 return self._binaryop( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) @@ -6250,9 +6126,7 @@ def rank( method_enum = libcudf.pylibcudf.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - "na_option must be one of 'keep', 'top', or 'bottom'" - ) + raise ValueError("na_option must be one of 'keep', 'top', or 'bottom'") if axis not in (0, "index"): raise NotImplementedError( @@ -6262,9 +6136,9 @@ def rank( source = self if numeric_only: - if isinstance( - source, cudf.Series - ) and not _is_non_decimal_numeric_dtype(self.dtype): + if isinstance(source, cudf.Series) and not _is_non_decimal_numeric_dtype( + self.dtype + ): raise TypeError( "Series.rank does not allow numeric_only=True with " "non-numeric dtype." 
@@ -6382,9 +6256,7 @@ def _get_replacement_values_for_columns( if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} values_columns = {col: [value] for col in columns_dtype_map} - elif cudf.api.types.is_list_like(to_replace) or isinstance( - to_replace, ColumnBase - ): + elif cudf.api.types.is_list_like(to_replace) or isinstance(to_replace, ColumnBase): if is_scalar(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = { @@ -6405,17 +6277,13 @@ def _get_replacement_values_for_columns( f" Expected {len(to_replace)}, got {len(value)}." ) else: - to_replace_columns = { - col: to_replace for col in columns_dtype_map - } + to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = {col: value for col in columns_dtype_map} elif cudf.utils.dtypes.is_column_like(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = {col: value for col in columns_dtype_map} else: - raise TypeError( - "value argument must be scalar, list-like or Series" - ) + raise TypeError("value argument must be scalar, list-like or Series") elif _is_series(to_replace): if value is None or value is no_default: to_replace_columns = { @@ -6424,18 +6292,14 @@ def _get_replacement_values_for_columns( values_columns = {col: to_replace for col in columns_dtype_map} elif is_dict_like(value): to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace + col: to_replace[col] for col in columns_dtype_map if col in to_replace } values_columns = { col: value[col] for col in to_replace_columns if col in value } elif is_scalar(value) or _is_series(value): to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace + col: to_replace[col] for col in columns_dtype_map if col in to_replace } values_columns = { col: [value] if is_scalar(value) else value[col] @@ -6444,8 +6308,7 @@ def _get_replacement_values_for_columns( } else: raise ValueError( - "Series.replace cannot use dict-like to_replace and non-None " - "value" + "Series.replace cannot use dict-like to_replace and non-None " "value" ) elif is_dict_like(to_replace): if value is None or value is no_default: @@ -6457,18 +6320,14 @@ def _get_replacement_values_for_columns( } elif is_dict_like(value): to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace + col: to_replace[col] for col in columns_dtype_map if col in to_replace } values_columns = { col: value[col] for col in columns_dtype_map if col in value } elif is_scalar(value) or _is_series(value): to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace + col: to_replace[col] for col in columns_dtype_map if col in to_replace } values_columns = { col: [value] if is_scalar(value) else value @@ -6496,9 +6355,7 @@ def _get_replacement_values_for_columns( for i in to_replace_columns: if i in values_columns: if isinstance(values_columns[i], list): - all_na = values_columns[i].count(None) == len( - values_columns[i] - ) + all_na = values_columns[i].count(None) == len(values_columns[i]) else: all_na = False all_na_columns[i] = all_na @@ -6565,9 +6422,7 @@ def _drop_rows_by_labels( join_res = working_df.join(to_join, how="leftanti") # 4. 
Reconstruct original layout, and rename - join_res._insert( - ilevel, name=join_res._index.name, value=join_res._index - ) + join_res._insert(ilevel, name=join_res._index.name, value=join_res._index) midx = cudf.MultiIndex.from_frame( join_res.iloc[:, 0:idx_nlv], names=obj._index.names @@ -6590,9 +6445,7 @@ def _drop_rows_by_labels( key_df = cudf.DataFrame._from_data( data={}, - index=cudf.Index( - labels, name=getattr(labels, "name", obj.index.name) - ), + index=cudf.Index(labels, name=getattr(labels, "name", obj.index.name)), ) if isinstance(obj, cudf.DataFrame): res = obj.join(key_df, how="leftanti") diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 7242de9964f..66b364a1686 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -108,17 +108,13 @@ def destructure_iloc_key( # shape of frame indexers = key + (slice(None),) * (n - len(key)) if len(indexers) > n: - raise IndexError( - f"Too many indexers: got {len(indexers)} expected {n}" - ) + raise IndexError(f"Too many indexers: got {len(indexers)} expected {n}") else: # Key indexes rows, slice-expand to shape of frame indexers = (key, *(slice(None),) * (n - 1)) indexers = tuple(k(frame) if callable(k) else k for k in indexers) if any(isinstance(k, tuple) for k in indexers): - raise IndexError( - "Too many indexers: can't have nested tuples in iloc indexing" - ) + raise IndexError("Too many indexers: can't have nested tuples in iloc indexing") return indexers @@ -154,17 +150,14 @@ def destructure_dataframe_iloc_indexer( cols = slice(None) scalar = is_integer(cols) try: - column_names: ColumnLabels = list( - frame._data.get_labels_by_index(cols) - ) + column_names: ColumnLabels = list(frame._data.get_labels_by_index(cols)) if len(set(column_names)) != len(column_names): raise NotImplementedError( "cudf DataFrames do not support repeated column names" ) except TypeError: raise TypeError( - "Column indices must be integers, slices, " - "or list-like of integers" + "Column indices must be integers, slices, " "or list-like of integers" ) if scalar: assert ( @@ -238,6 +231,5 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( - "Cannot index by location " - f"with non-integer key of type {type(key)}" + "Cannot index by location " f"with non-integer key of type {type(key)}" ) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 6a619945e75..c5163407c30 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -91,8 +91,7 @@ def _match_join_keys( np.issubdtype(ltype, np.number) and np.issubdtype(rtype, np.number) and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) + np.issubdtype(ltype, np.timedelta64) or np.issubdtype(rtype, np.timedelta64) ) ): common_type = ( @@ -101,24 +100,20 @@ def _match_join_keys( else np.result_type(ltype, rtype) ) elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) + np.issubdtype(ltype, np.datetime64) and np.issubdtype(rtype, np.datetime64) ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + np.issubdtype(ltype, np.timedelta64) and np.issubdtype(rtype, np.timedelta64) ): 
common_type = max(ltype, rtype) elif ( - np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) + np.issubdtype(ltype, np.datetime64) or np.issubdtype(ltype, np.timedelta64) ) and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) + np.issubdtype(rtype, np.datetime64) or np.issubdtype(rtype, np.timedelta64) ) and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " @@ -145,8 +140,7 @@ def _match_categorical_dtypes_both( # ambiguous and not allowed. if ltype.ordered != rtype.ordered: raise TypeError( - "Merging on categorical variables with mismatched" - " ordering is ambiguous" + "Merging on categorical variables with mismatched" " ordering is ambiguous" ) if ltype.ordered and rtype.ordered: @@ -176,9 +170,7 @@ def _match_categorical_dtypes_both( merged_categories = cudf.concat( [ltype.categories, rtype.categories] ).unique() - common_type = cudf.CategoricalDtype( - categories=merged_categories, ordered=False - ) + common_type = cudf.CategoricalDtype(categories=merged_categories, ordered=False) return lcol.astype(common_type), rcol.astype(common_type) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 1ef2915bc59..ccabb93c1fe 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -125,9 +125,7 @@ def __init__( self.sort = sort or ( cudf.get_option("mode.pandas_compatible") and how == "outer" ) - self.preserve_key_order = cudf.get_option( - "mode.pandas_compatible" - ) and how in { + self.preserve_key_order = cudf.get_option("mode.pandas_compatible") and how in { "inner", "outer", "left", @@ -139,16 +137,10 @@ def __init__( # don't have any other args, so we can apply it directly to left_on and # right_on. 
self._using_left_index = bool(left_index) - left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on - ) + left_on = lhs.index._data.names if left_index else left_on if left_on else on self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names - if right_index - else right_on - if right_on - else on + rhs.index._data.names if right_index else right_on if right_on else on ) if left_on or right_on: @@ -192,8 +184,7 @@ def __init__( for lkey, rkey in zip(self._left_keys, self._right_keys) if lkey.name == rkey.name and not ( - isinstance(lkey, _IndexIndexer) - or isinstance(rkey, _IndexIndexer) + isinstance(lkey, _IndexIndexer) or isinstance(rkey, _IndexIndexer) ) } ) @@ -231,11 +222,7 @@ def _gather_maps(self, left_cols, right_cols): key_order = list( itertools.chain.from_iterable( libcudf.copying.gather( - [ - cudf.core.column.as_column( - range(n), dtype=size_type_dtype - ) - ], + [cudf.core.column.as_column(range(n), dtype=size_type_dtype)], map_, nullify=null, ) @@ -275,17 +262,13 @@ def perform_merge(self) -> cudf.DataFrame: left_key.set(self.lhs, lcol_casted, validate=False) right_key.set(self.rhs, rcol_casted, validate=False) - left_rows, right_rows = self._gather_maps( - left_join_cols, right_join_cols - ) + left_rows, right_rows = self._gather_maps(left_join_cols, right_join_cols) gather_kwargs = { "keep_index": self._using_left_index or self._using_right_index, } left_result = ( self.lhs._gather( - GatherMap.from_column_unchecked( - left_rows, len(self.lhs), nullify=True - ), + GatherMap.from_column_unchecked(left_rows, len(self.lhs), nullify=True), **gather_kwargs, ) if left_rows is not None @@ -311,9 +294,7 @@ def perform_merge(self) -> cudf.DataFrame: result = self._sort_result(result) return result - def _merge_results( - self, left_result: cudf.DataFrame, right_result: cudf.DataFrame - ): + def _merge_results(self, left_result: cudf.DataFrame, right_result: cudf.DataFrame): # Merge the DataFrames `left_result` and `right_result` into a single # `DataFrame`, suffixing column names if necessary. @@ -335,9 +316,7 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. 
- common_names = set(left_result._data.names) & set( - right_result._data.names - ) + common_names = set(left_result._data.names) & set(right_result._data.names) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col @@ -360,9 +339,7 @@ def _merge_results( # - either one of `lhs` or `rhs` have a MultiIndex columns, # and the other is empty (i.e., no columns) if self.lhs._data and self.rhs._data: - multiindex_columns = ( - self.lhs._data.multiindex and self.rhs._data.multiindex - ) + multiindex_columns = self.lhs._data.multiindex and self.rhs._data.multiindex elif self.lhs._data: multiindex_columns = self.lhs._data.multiindex elif self.rhs._data: @@ -382,9 +359,7 @@ def _merge_results( # Construct result from data and index: return ( - left_result._data.__class__( - data=data, multiindex=multiindex_columns - ), + left_result._data.__class__(data=data, multiindex=multiindex_columns), index, ) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 7bbb299d643..2adc454b132 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import inspect @@ -59,9 +59,7 @@ def __get__(self, obj, owner=None): retfunc.__annotations__.pop("op", None) retfunc_params = [ v - for k, v in inspect.signature( - self._base_operation - ).parameters.items() + for k, v in inspect.signature(self._base_operation).parameters.items() if k != "op" ] retfunc.__signature__ = inspect.Signature(retfunc_params) @@ -230,12 +228,10 @@ def __init_subclass__(cls): base_operation = getattr(cls, base_operation_name) for operation in valid_operations: - if _should_define_operation( - cls, operation, base_operation_name - ): - docstring_format_args = getattr( - cls, docstring_attr, {} - ).get(operation, {}) + if _should_define_operation(cls, operation, base_operation_name): + docstring_format_args = getattr(cls, docstring_attr, {}).get( + operation, {} + ) op_attr = Operation( operation, docstring_format_args, base_operation ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 019daacddba..6046b462982 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -134,9 +134,7 @@ def __init__( if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") if name is not None: - raise NotImplementedError( - "Use `names`, `name` is not yet supported" - ) + raise NotImplementedError("Use `names`, `name` is not yet supported") if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") if not isinstance(codes, cudf.DataFrame) and not isinstance( @@ -147,9 +145,7 @@ def __init__( if copy: if isinstance(codes, cudf.DataFrame): codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance( - levels[0], (cudf.Index, cudf.Series) - ): + if len(levels) > 0 and isinstance(levels[0], (cudf.Index, cudf.Series)): levels = [level.copy(deep=True) for level in levels] if not isinstance(codes, cudf.DataFrame): @@ -175,8 +171,7 @@ def __init__( ) if len({c.size for c in codes._data.columns}) != 1: raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" + "MultiIndex length of codes does not match " "and is inconsistent!" 
) source_data = {} @@ -184,18 +179,12 @@ def __init__( if len(code): lo, hi = libcudf.reduce.minmax(code) if lo.value < -1 or hi.value > len(level) - 1: - raise ValueError( - f"Codes must be -1 <= codes <= {len(level) - 1}" - ) + raise ValueError(f"Codes must be -1 <= codes <= {len(level) - 1}") if lo.value == -1: # Now we can gather and insert null automatically code[code == -1] = np.iinfo(size_type_dtype).min - result_col = libcudf.copying.gather( - [level._column], code, nullify=True - ) - source_data[column_name] = result_col[0]._with_type_metadata( - level.dtype - ) + result_col = libcudf.copying.gather([level._column], code, nullify=True) + source_data[column_name] = result_col[0]._with_type_metadata(level.dtype) super().__init__(source_data) self._levels = levels @@ -237,9 +226,7 @@ def names(self, value): @_cudf_nvtx_annotate def to_series(self, index=None, name=None): - raise NotImplementedError( - "MultiIndex.to_series isn't implemented yet." - ) + raise NotImplementedError("MultiIndex.to_series isn't implemented yet.") @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): @@ -303,9 +290,7 @@ def set_names(self, names, level=None, inplace=False): level_is_list_like = is_list_like(level) if level is not None and not level_is_list_like and names_is_list_like: - raise TypeError( - "Names must be a string when a single level is provided." - ) + raise TypeError("Names must be a string when a single level is provided.") if not names_is_list_like and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") @@ -452,17 +437,13 @@ def __repr__(self): column.timedelta.TimeDeltaColumn, ), ): - preprocess_df[name] = col.astype("str").fillna( - str(cudf.NaT) - ) + preprocess_df[name] = col.astype("str").fillna(str(cudf.NaT)) tuples_list = list( zip( *list( map(lambda val: pd.NA if val is None else val, col) - for col in preprocess_df.to_arrow() - .to_pydict() - .values() + for col in preprocess_df.to_arrow().to_pydict().values() ) ) ) @@ -661,9 +642,7 @@ def isin(self, values, level=None): "squences when `level=None`." 
) else: - values_idx = cudf.MultiIndex.from_tuples( - values, names=self.names - ) + values_idx = cudf.MultiIndex.from_tuples(values, names=self.names) self_df = self.to_frame(index=False).reset_index() values_df = values_idx.to_frame(index=False) idx = self_df.merge(values_df, how="leftsemi")._data["index"] @@ -677,9 +656,7 @@ def isin(self, values, level=None): return result def where(self, cond, other=None, inplace=False): - raise NotImplementedError( - ".where is not supported for MultiIndex operations" - ) + raise NotImplementedError(".where is not supported for MultiIndex operations") @_cudf_nvtx_annotate def _compute_levels_and_codes(self): @@ -710,11 +687,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): [ frame, cudf.DataFrame( - { - "idx": cudf.Series( - column.as_column(range(len(frame))) - ) - } + {"idx": cudf.Series(column.as_column(range(len(frame))))} ), ], axis=1, @@ -727,9 +700,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): if cudf.get_option("mode.pandas_compatible"): lookup_order = "_" + "_".join(map(str, lookup._data.names)) lookup[lookup_order] = column.as_column(range(len(lookup))) - postprocess = operator.methodcaller( - "sort_values", by=[lookup_order, "idx"] - ) + postprocess = operator.methodcaller("sort_values", by=[lookup_order, "idx"]) else: postprocess = lambda r: r # noqa: E731 result = postprocess(lookup.merge(data_table))["idx"] @@ -763,12 +734,8 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): start_values = self._compute_validity_mask( index, row_tuple.start, max_length ) - stop_values = self._compute_validity_mask( - index, row_tuple.stop, max_length - ) - return column.as_column( - range(start_values.min(), stop_values.max() + 1) - ) + stop_values = self._compute_validity_mask(index, row_tuple.stop, max_length) + return column.as_column(range(start_values.min(), stop_values.max() + 1)) elif isinstance(row_tuple, numbers.Number): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) @@ -777,9 +744,9 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): index_key = [index_key] - if ( - len(index_key) > 0 and not isinstance(index_key, tuple) - ) or isinstance(index_key[0], slice): + if (len(index_key) > 0 and not isinstance(index_key, tuple)) or isinstance( + index_key[0], slice + ): index_key = index_key[0] slice_access = isinstance(index_key, slice) @@ -845,9 +812,7 @@ def _index_and_downcast(self, result, index, index_key): def _get_row_major( self, df: DataFrameOrSeries, - row_tuple: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + row_tuple: Union[numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]], ) -> DataFrameOrSeries: if pd.api.types.is_bool_dtype( list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple @@ -870,18 +835,14 @@ def _get_row_major( @_cudf_nvtx_annotate def _validate_indexer( self, - indexer: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + indexer: Union[numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]], ): if isinstance(indexer, numbers.Number): return if isinstance(indexer, tuple): # drop any slice(None) from the end: indexer = tuple( - itertools.dropwhile( - lambda x: x == slice(None), reversed(indexer) - ) + itertools.dropwhile(lambda x: x == slice(None), reversed(indexer)) )[::-1] # now check for size @@ -947,9 +908,7 @@ def 
__getitem__(self, index): start, stop, step = index.indices(len(self)) index = column.as_column(range(start, stop, step)) result = MultiIndex.from_frame( - self.to_frame(index=False, name=range(0, self.nlevels)).take( - index - ), + self.to_frame(index=False, name=range(0, self.nlevels)).take(index), names=self.names, ) @@ -1019,14 +978,11 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): # modifications of the resulting DataFrame will affect the MultiIndex. if name is no_default: column_names = [ - level if name is None else name - for level, name in enumerate(self.names) + level if name is None else name for level, name in enumerate(self.names) ] else: if not is_list_like(name): - raise TypeError( - "'name' must be a list / sequence of column names." - ) + raise TypeError("'name' must be a list / sequence of column names.") if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " @@ -1035,9 +991,9 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): column_names = name all_none_names = None - if not ( - all_none_names := all(x is None for x in column_names) - ) and len(column_names) != len(set(column_names)): + if not (all_none_names := all(x is None for x in column_names)) and len( + column_names + ) != len(set(column_names)): raise ValueError("Duplicate column names are not allowed") df = cudf.DataFrame._from_data( data=self._data, @@ -1226,9 +1182,7 @@ def values(self): """ if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError( - "Unable to create a cupy array with tuples." - ) + raise NotImplementedError("Unable to create a cupy array with tuples.") return self.to_frame(index=False).values @classmethod @@ -1404,9 +1358,7 @@ def from_arrays( code, level = factorize(array, sort=True) codes.append(code) levels.append(level) - return cls( - codes=codes, levels=levels, sortorder=sortorder, names=names - ) + return cls(codes=codes, levels=levels, sortorder=sortorder, names=names) @_cudf_nvtx_annotate def _poplevels(self, level): @@ -1613,16 +1565,12 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): if not isinstance(multiindex, pd.MultiIndex): raise TypeError("not a pandas.MultiIndex") if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) + nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None levels = [ cudf.Index.from_pandas(level, nan_as_null=nan_as_null) for level in multiindex.levels ] - return cls( - levels=levels, codes=multiindex.codes, names=multiindex.names - ) + return cls(levels=levels, codes=multiindex.codes, names=multiindex.names) @cached_property # type: ignore @_cudf_nvtx_annotate @@ -1649,9 +1597,7 @@ def is_monotonic_decreasing(self): Return if the index is monotonic decreasing (only equal or decreasing) values. """ - return self._is_sorted( - ascending=[False] * len(self.levels), null_position=None - ) + return self._is_sorted(ascending=[False] * len(self.levels), null_position=None) @_cudf_nvtx_annotate def fillna(self, value): @@ -1830,27 +1776,20 @@ def _level_index_from_level(self, level): level += self.nlevels if level >= self.nlevels: raise IndexError( - f"Level {level} out of bounds. " - f"Index has {self.nlevels} levels." + f"Level {level} out of bounds. " f"Index has {self.nlevels} levels." 
) from None return level @_cudf_nvtx_annotate def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: - raise NotImplementedError( - "Parameter tolerance is not supported yet." - ) + raise NotImplementedError("Parameter tolerance is not supported yet.") if method == "nearest": - raise NotImplementedError( - f"{method=} is not supported yet for MultiIndex." - ) + raise NotImplementedError(f"{method=} is not supported yet for MultiIndex.") if method in {"ffill", "bfill", "pad", "backfill"} and not ( self.is_monotonic_increasing or self.is_monotonic_decreasing ): - raise ValueError( - "index must be monotonic increasing or decreasing" - ) + raise ValueError("index must be monotonic increasing or decreasing") result = column.as_column( -1, @@ -1881,9 +1820,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): index=self, positions=result_series, method=method, - target_col=target.to_frame(index=False)[ - list(range(0, self.nlevels)) - ], + target_col=target.to_frame(index=False)[list(range(0, self.nlevels))], tolerance=tolerance, ) elif method is not None: @@ -1896,9 +1833,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): @_cudf_nvtx_annotate def get_loc(self, key): - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) + is_sorted = self.is_monotonic_increasing or self.is_monotonic_decreasing is_unique = self.is_unique key = (key,) if not isinstance(key, tuple) else key @@ -1922,11 +1857,7 @@ def get_loc(self, key): if is_unique and lower_bound + 1 == upper_bound: # Indices are unique (Pandas constraint), search result is unique, # return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) - ) + return lower_bound if is_sorted else sort_inds.element_indexing(lower_bound) if is_sorted: # In monotonic index, lex search result is continuous. A slice for @@ -2070,8 +2001,7 @@ def _split_columns_by_levels(self, levels): # Normalize named levels into indices level_names = list(self.names) level_indices = { - lv if isinstance(lv, int) else level_names.index(lv) - for lv in levels + lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } # Split the columns diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 1a79b122561..e69701ee838 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -98,9 +98,7 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): obj_type = pickle.loads(header["obj_type"]) - obj = obj_type.deserialize( - header["obj"], frames[: header["num_obj_frames"]] - ) + obj = obj_type.deserialize(header["obj"], frames[: header["num_obj_frames"]]) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) @@ -183,9 +181,7 @@ def _handle_frequency_grouper(self, by): "Resampling by DateOffset objects is not yet supported." ) if not isinstance(freq, str): - raise TypeError( - f"Unsupported type for freq: {type(freq).__name__}" - ) + raise TypeError(f"Unsupported type for freq: {type(freq).__name__}") # convert freq to a pd.DateOffset: offset = pd.tseries.frequencies.to_offset(freq) @@ -247,9 +243,7 @@ def _handle_frequency_grouper(self, by): # column to have the same dtype, so we compute a `result_type` # and cast them both to that type. 
try: - result_type = np.dtype( - _unit_dtype_map[_offset_alias_to_code[offset.name]] - ) + result_type = np.dtype(_unit_dtype_map[_offset_alias_to_code[offset.name]]) except KeyError: # unsupported resolution (we don't support resolutions >s) # fall back to using datetime64[s] @@ -334,9 +328,7 @@ def _get_timestamp_range_edges( if isinstance(origin, pd.Timestamp) and (origin.tz is None) != ( index_tz is None ): - raise ValueError( - "The origin must have the same timezone as the index." - ) + raise ValueError("The origin must have the same timezone as the index.") elif origin == "epoch": # set the epoch based on the timezone to have similar bins results # when resampling on the same kind of indexes on different diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 2ef39e9357d..9a213260847 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -42,9 +42,7 @@ def _align_objs(objs, how="outer", sort=None): i_objs = iter(objs) first = next(i_objs) - not_matching_index = any( - not first.index.equals(rest.index) for rest in i_objs - ) + not_matching_index = any(not first.index.equals(rest.index) for rest in i_objs) if not_matching_index: if not all(o.index.is_unique for o in objs): @@ -59,9 +57,7 @@ def _align_objs(objs, how="outer", sort=None): final_index.name = name return [ - obj.reindex(final_index) - if not final_index.equals(obj.index) - else obj + obj.reindex(final_index) if not final_index.equals(obj.index) else obj for obj in objs ] else: @@ -242,9 +238,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): axis = _AXIS_MAP.get(axis, None) if axis is None: - raise ValueError( - f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}' - ) + raise ValueError(f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}') # Return for single object if len(objs) == 1: @@ -325,9 +319,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): any_empty = any(obj.empty for obj in objs) if any_empty: # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. In a future version, this will no longer exclude " @@ -363,9 +355,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. 
- df[name] = cudf.core.column.column_empty_like( - col, newsize=0 - ) + df[name] = cudf.core.column.column_empty_like(col, newsize=0) else: df[name] = col @@ -720,9 +710,7 @@ def get_dummies( encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes( - include=encode_fallback_dtypes - )._column_names + columns = df.select_dtypes(include=encode_fallback_dtypes)._column_names _length_check_params(prefix, columns, "prefix") _length_check_params(prefix_sep, columns, "prefix_sep") @@ -757,9 +745,7 @@ def get_dummies( for name in columns: if name not in cats: - unique = _get_unique( - column=df._data[name], dummy_na=dummy_na - ) + unique = _get_unique(column=df._data[name], dummy_na=dummy_na) else: unique = as_column(cats[name]) @@ -839,9 +825,7 @@ def _merge_sorted( if keys is None: key_columns_indices = list(range(0, objs[0]._num_columns)) else: - key_columns_indices = [ - objs[0]._column_names.index(key) for key in keys - ] + key_columns_indices = [objs[0]._column_names.index(key) for key in keys] if not ignore_index: key_columns_indices = [ idx + objs[0]._index.nlevels for idx in key_columns_indices @@ -910,10 +894,7 @@ def as_tuple(x): target._data[None][scatter_map] = col result_frames = target._split(range(nrows, nrows * ncols, nrows)) result.update( - { - name: next(iter(f._columns)) - for name, f in zip(names, result_frames) - } + {name: next(iter(f._columns)) for name, f in zip(names, result_frames)} ) return cudf.DataFrame._from_data( @@ -1122,9 +1103,7 @@ def unstack(df, level, fill_value=None): ) res = df.T.stack(future_stack=False) # Result's index is a multiindex - res.index.names = ( - tuple(df._data.to_pandas_index().names) + df.index.names - ) + res.index.names = tuple(df._data.to_pandas_index().names) + df.index.names return res else: columns = df.index._poplevels(level) @@ -1445,9 +1424,7 @@ def pivot_table( # discard the top level if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] - table_columns = tuple( - map(lambda column: column[1:], table._data.names) - ) + table_columns = tuple(map(lambda column: column[1:], table._data.names)) table.columns = cudf.MultiIndex.from_tuples( tuples=table_columns, names=column_names ) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f7d05e53ce7..f9816c1c811 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -190,9 +190,7 @@ def _preprocess_host_value(self, value, dtype): if dtype is not None: raise TypeError("Lists may not be cast to a different dtype") else: - dtype = ListDtype.from_arrow( - pa.infer_type([value], from_pandas=True) - ) + dtype = ListDtype.from_arrow(pa.infer_type([value], from_pandas=True)) return value, dtype elif isinstance(dtype, ListDtype): if value not in {None, NA}: @@ -202,9 +200,7 @@ def _preprocess_host_value(self, value, dtype): if isinstance(value, dict): if dtype is None: - dtype = StructDtype.from_arrow( - pa.infer_type([value], from_pandas=True) - ) + dtype = StructDtype.from_arrow(pa.infer_type([value], from_pandas=True)) return value, dtype elif isinstance(dtype, StructDtype): if value not in {None, NA}: @@ -226,15 +222,11 @@ def _preprocess_host_value(self, value, dtype): if isinstance(value, (np.datetime64, np.timedelta64)): unit, _ = np.datetime_data(value) if unit == "generic": - raise TypeError( - "Cant convert generic NaT to null scalar" - ) + raise TypeError("Cant convert generic NaT to null scalar") else: dtype = 
value.dtype else: - raise TypeError( - "dtype required when constructing a null scalar" - ) + raise TypeError("dtype required when constructing a null scalar") else: dtype = value.dtype @@ -243,9 +235,7 @@ def _preprocess_host_value(self, value, dtype): if not valid: value = ( - NaT - if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) - else NA + NaT if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) else NA ) return value, dtype @@ -303,18 +293,13 @@ def __neg__(self): def __repr__(self): # str() fixes a numpy bug with NaT # https://github.com/numpy/numpy/issues/17552 - return ( - f"{self.__class__.__name__}" - f"({str(self.value)}, dtype={self.dtype})" - ) + return f"{self.__class__.__name__}" f"({str(self.value)}, dtype={self.dtype})" def _binop_result_dtype_or_error(self, other, op): if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}: return np.bool_ - out_dtype = get_allowed_combinations_for_operator( - self.dtype, other.dtype, op - ) + out_dtype = get_allowed_combinations_for_operator(self.dtype, other.dtype, op) # datetime handling if out_dtype in {"M", "m"}: @@ -329,10 +314,7 @@ def _binop_result_dtype_or_error(self, other, op): }: return other.dtype else: - if ( - op == "__sub__" - and self.dtype.char == other.dtype.char == "M" - ): + if op == "__sub__" and self.dtype.char == other.dtype.char == "M": res, _ = np.datetime_data(max(self.dtype, other.dtype)) return cudf.dtype("m8" + f"[{res}]") return np.result_type(self.dtype, other.dtype) @@ -371,8 +353,7 @@ def _dispatch_scalar_binop(self, other, op): def _unaop_result_type_or_error(self, op): if op == "__neg__" and self.dtype == "bool": raise TypeError( - "Boolean scalars in cuDF do not support" - " negation, use logical not" + "Boolean scalars in cuDF do not support" " negation, use logical not" ) if op in {"__ceil__", "__floor__"}: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 275dc664175..399b62edee6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -158,9 +158,7 @@ def _describe_categorical(obj, percentiles): # In case there's a tie, break the tie by sorting the index # and take the top. val_counts = obj.value_counts(ascending=False) - tied_val_counts = val_counts[ - val_counts == val_counts.iloc[0] - ].sort_index() + tied_val_counts = val_counts[val_counts == val_counts.iloc[0]].sort_index() data.update( { "top": tied_val_counts.index[0], @@ -214,26 +212,19 @@ def __setitem__(self, key, value): ) and cudf.utils.utils._isnat(value) and not ( - isinstance( - self._frame._column, cudf.core.column.StringColumn - ) + isinstance(self._frame._column, cudf.core.column.StringColumn) and isinstance(value, str) ) ): raise MixedTypeError( - f"Cannot assign {value=} to non-datetime/non-timedelta " - "columns" + f"Cannot assign {value=} to non-datetime/non-timedelta " "columns" ) elif ( not ( is_float_dtype(self._frame._column.dtype) or ( - isinstance( - self._frame._column.dtype, cudf.CategoricalDtype - ) - and is_float_dtype( - self._frame._column.dtype.categories.dtype - ) + isinstance(self._frame._column.dtype, cudf.CategoricalDtype) + and is_float_dtype(self._frame._column.dtype.categories.dtype) ) ) and isinstance(value, (np.float32, np.float64)) @@ -276,9 +267,7 @@ def __setitem__(self, key, value): value = value.astype(to_dtype) if to_dtype != self._frame._column.dtype: # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." 
+ assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( f"Setting an item of incompatible dtype is deprecated " "and will raise in a future error of pandas. " @@ -385,9 +374,7 @@ def _loc_to_iloc(self, arg): and is_integer_dtype(index_dtype.categories.dtype) ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and is_integer_dtype(arg.dtype): # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 @@ -492,9 +479,7 @@ def _constructor(self): @property def _constructor_sliced(self): - raise NotImplementedError( - "_constructor_sliced not supported for Series!" - ) + raise NotImplementedError("_constructor_sliced not supported for Series!") @property def _constructor_expanddim(self): @@ -638,11 +623,7 @@ def __init__( # be expensive or mark a buffer as # unspillable. has_cai = ( - type( - inspect.getattr_static( - data, "__cuda_array_interface__", None - ) - ) + type(inspect.getattr_static(data, "__cuda_array_interface__", None)) is property ) column = as_column( @@ -741,9 +722,7 @@ def from_pandas(cls, s: pd.Series, nan_as_null=no_default): dtype: float64 """ if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) + nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) result = cls(s, nan_as_null=nan_as_null) @@ -798,9 +777,7 @@ def dt(self): elif isinstance(self._column, TimeDeltaColumn): return TimedeltaProperties(self) else: - raise AttributeError( - "Can only use .dt accessor with datetimelike values" - ) + raise AttributeError("Can only use .dt accessor with datetimelike values") @property # type: ignore @_cudf_nvtx_annotate @@ -860,9 +837,7 @@ def serialize(self): @_cudf_nvtx_annotate def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] - obj = super().deserialize( - header, frames[header["index_frame_count"] :] - ) + obj = super().deserialize(header, frames[header["index_frame_count"] :]) idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) @@ -900,9 +875,7 @@ def drop( # Ignore columns for Series if columns is not None: columns = [] - return super().drop( - labels, axis, index, columns, level, inplace, errors - ) + return super().drop(labels, axis, index, columns, level, inplace, errors) def tolist(self): # noqa: D102 raise TypeError( @@ -1001,9 +974,7 @@ def reindex(self, *args, **kwargs): """ if len(args) > 1: - raise TypeError( - "Only one positional argument ('index') is allowed" - ) + raise TypeError("Only one positional argument ('index') is allowed") if args: (index,) = args if "index" in kwargs: @@ -1088,13 +1059,10 @@ def reindex(self, *args, **kwargs): """, ) ) - def reset_index( - self, level=None, drop=False, name=no_default, inplace=False - ): + def reset_index(self, level=None, drop=False, name=no_default, inplace=False): if not drop and inplace: raise TypeError( - "Cannot reset_index inplace on a Series " - "to create a DataFrame" + "Cannot reset_index inplace on a Series " "to create a DataFrame" ) data, index = self._reset_index(level=level, drop=drop) if not drop: @@ -1155,9 +1123,7 @@ def to_frame(self, name=None): @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): - return self._column.memory_usage + ( - self._index.memory_usage() 
if index else 0 - ) + return self._column.memory_usage + (self._index.memory_usage() if index else 0) @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): @@ -1285,9 +1251,7 @@ def map(self, arg, na_action=None) -> "Series": raise NotImplementedError( "default values in dicts are currently not supported." ) - lhs = cudf.DataFrame( - {"x": self, "orig_order": as_column(range(len(self)))} - ) + lhs = cudf.DataFrame({"x": self, "orig_order": as_column(range(len(self)))}) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1295,21 +1259,16 @@ def map(self, arg, na_action=None) -> "Series": "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) - res = lhs.merge(rhs, on="x", how="left").sort_values( - by="orig_order" - ) + res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") result = res["s"] result.name = self.name result.index = self.index elif isinstance(arg, cudf.Series): if not arg.index.is_unique: raise ValueError( - "Reindexing only valid with" - " uniquely valued Index objects" + "Reindexing only valid with" " uniquely valued Index objects" ) - lhs = cudf.DataFrame( - {"x": self, "orig_order": as_column(range(len(self)))} - ) + lhs = cudf.DataFrame({"x": self, "orig_order": as_column(range(len(self)))}) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1317,9 +1276,7 @@ def map(self, arg, na_action=None) -> "Series": "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) - res = lhs.merge(rhs, on="x", how="left").sort_values( - by="orig_order" - ) + res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") result = res["s"] result.name = self.name result.index = self.index @@ -1355,9 +1312,7 @@ def _getitem_preprocessed( elif isinstance(spec, indexing_utils.SliceIndexer): return self._slice(spec.key) elif isinstance(spec, indexing_utils.ScalarIndexer): - return self._gather( - spec.key, keep_index=False - )._column.element_indexing(0) + return self._gather(spec.key, keep_index=False)._column.element_indexing(0) elif isinstance(spec, indexing_utils.EmptyIndexer): return self._empty_like(keep_index=True) assert_never(spec) @@ -1422,12 +1377,8 @@ def __repr__(self): ) else str(cudf.NA) ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance( - preprocess._column, cudf.core.column.CategoricalColumn - ): + output = repr(preprocess.astype("str").fillna(fill_value).to_pandas()) + elif isinstance(preprocess._column, cudf.core.column.CategoricalColumn): min_rows = ( height if pd.get_option("display.min_rows") == 0 @@ -1518,9 +1469,7 @@ def _make_operands_and_index_for_binop( and fn in cudf.utils.utils._EQUALITY_OPS and not self.index.equals(other.index) ): - raise ValueError( - "Can only compare identically-labeled Series objects" - ) + raise ValueError("Can only compare identically-labeled Series objects") lhs, other = _align_indices([self, other], allow_non_unique=True) else: lhs = self @@ -1575,9 +1524,7 @@ def _concat(cls, objs, axis=0, index=True): else: with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) - index = cudf.core.index.Index._concat( - [o.index for o in objs] - ) + index = cudf.core.index.Index._concat([o.index for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: @@ -1591,12 +1538,8 @@ def _concat(cls, objs, axis=0, index=True): if ( obj.null_count == len(obj) or len(obj) == 0 - or isinstance( - obj._column, cudf.core.column.CategoricalColumn - ) - or isinstance( - objs[0]._column, cudf.core.column.CategoricalColumn - ) + or 
isinstance(obj._column, cudf.core.column.CategoricalColumn) + or isinstance(objs[0]._column, cudf.core.column.CategoricalColumn) ): continue @@ -1751,9 +1694,7 @@ def dropna(self, axis=0, inplace=False, how=None): dtype: object """ if axis not in (0, "index"): - raise ValueError( - "Series.dropna supports only one axis to drop values from" - ) + raise ValueError("Series.dropna supports only one axis to drop values from") result = super().dropna(axis=axis) @@ -1834,9 +1775,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) @_cudf_nvtx_annotate - def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): + def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): if isinstance(value, pd.Series): value = Series.from_pandas(value) @@ -2328,8 +2267,7 @@ def argsort( def replace(self, to_replace=None, value=no_default, *args, **kwargs): if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( - "Series.replace cannot use dict-like to_replace and non-None " - "value" + "Series.replace cannot use dict-like to_replace and non-None " "value" ) return super().replace(to_replace, value, *args, **kwargs) @@ -2683,9 +2621,7 @@ def mode(self, dropna=True): @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): if not is_integer(decimals): - raise ValueError( - f"decimals must be an int, got {type(decimals).__name__}" - ) + raise ValueError(f"decimals must be an int, got {type(decimals).__name__}") decimals = int(decimals) return super().round(decimals, how) @@ -2720,9 +2656,7 @@ def cov(self, other, min_periods=None): """ if min_periods is not None: - raise NotImplementedError( - "min_periods parameter is not implemented yet" - ) + raise NotImplementedError("min_periods parameter is not implemented yet") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -2736,8 +2670,7 @@ def cov(self, other, min_periods=None): return lhs._column.cov(rhs._column) except AttributeError: raise TypeError( - f"cannot perform covariance with types {self.dtype}, " - f"{other.dtype}" + f"cannot perform covariance with types {self.dtype}, " f"{other.dtype}" ) @_cudf_nvtx_annotate @@ -3136,9 +3069,9 @@ def value_counts( res = res[res.index.notna()] else: res = self.groupby(self, dropna=dropna).count(dropna=dropna) - if isinstance(self.dtype, cudf.CategoricalDtype) and len( - res - ) != len(self.dtype.categories): + if isinstance(self.dtype, cudf.CategoricalDtype) and len(res) != len( + self.dtype.categories + ): # For categorical dtypes: When there exists # categories in dtypes and they are missing in the # column, `value_counts` will have to return @@ -3167,9 +3100,7 @@ def value_counts( return res @_cudf_nvtx_annotate - def quantile( - self, q=0.5, interpolation="linear", exact=True, quant_index=True - ): + def quantile(self, q=0.5, interpolation="linear", exact=True, quant_index=True): """ Return values at the given quantile. 
@@ -3228,9 +3159,7 @@ def quantile( try: np_array_q = cudf.core.column.as_column(q).values_host except TypeError: - raise TypeError( - f"q must be a scalar or array-like, got {type(q)}" - ) + raise TypeError(f"q must be a scalar or array-like, got {type(q)}") result = self._column.quantile( np_array_q, interpolation, exact, return_scalar=return_scalar @@ -3322,9 +3251,7 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series( - cudf.core.column.numerical.digitize(self._column, bins, right) - ) + return Series(cudf.core.column.numerical.digitize(self._column, bins, right)) @_cudf_nvtx_annotate def diff(self, periods=1): @@ -3626,9 +3553,7 @@ def pct_change( ) if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " @@ -4215,9 +4140,7 @@ def quarter(self): 1 4 dtype: int8 """ - res = libcudf.datetime.extract_quarter(self.series._column).astype( - np.int8 - ) + res = libcudf.datetime.extract_quarter(self.series._column).astype(np.int8) return Series._from_data( {None: res}, index=self.series._index, @@ -4413,9 +4336,7 @@ def is_quarter_start(self): dtype: bool """ day = self.series._column.get_dt_field("day") - first_month = self.series._column.get_dt_field("month").isin( - [1, 4, 7, 10] - ) + first_month = self.series._column.get_dt_field("month").isin([1, 4, 7, 10]) result = ((day == cudf.Scalar(1)) & first_month).fillna(False) return Series._from_data( @@ -4464,9 +4385,7 @@ def is_quarter_end(self): day = self.series._column.get_dt_field("day") last_day = libcudf.datetime.last_day_of_month(self.series._column) last_day = last_day.get_dt_field("day") - last_month = self.series._column.get_dt_field("month").isin( - [3, 6, 9, 12] - ) + last_month = self.series._column.get_dt_field("month").isin([3, 6, 9, 12]) result = ((day == last_day) & last_month).fillna(False) return Series._from_data( @@ -4501,9 +4420,7 @@ def is_year_start(self): 2 True dtype: bool """ - outcol = self.series._column.get_dt_field( - "day_of_year" - ) == cudf.Scalar(1) + outcol = self.series._column.get_dt_field("day_of_year") == cudf.Scalar(1) return Series._from_data( {None: outcol.fillna(False)}, index=self.series._index, @@ -4552,9 +4469,7 @@ def is_year_end(self): @_cudf_nvtx_annotate def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) - return Series( - data=out_column, index=self.series._index, name=self.series.name - ) + return Series(data=out_column, index=self.series._index, name=self.series.name) @_cudf_nvtx_annotate def ceil(self, freq): @@ -4725,9 +4640,7 @@ def strftime(self, date_format, *args, **kwargs): """ if not isinstance(date_format, str): - raise TypeError( - f"'date_format' must be str, not {type(date_format)}" - ) + raise TypeError(f"'date_format' must be str, not {type(date_format)}") # TODO: Remove following validations # once https://github.com/rapidsai/cudf/issues/5991 @@ -4745,12 +4658,8 @@ def strftime(self, date_format, *args, **kwargs): f"https://github.com/rapidsai/cudf/issues/5991 " f"for tracking purposes." 
) - str_col = self.series._column.as_string_column( - dtype="str", format=date_format - ) - return Series( - data=str_col, index=self.series._index, name=self.series.name - ) + str_col = self.series._column.as_string_column(dtype="str", format=date_format) + return Series(data=str_col, index=self.series._index, name=self.series.name) @copy_docstring(DatetimeIndex.tz_localize) def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): @@ -4759,9 +4668,7 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): if tz is None: result_col = delocalize(self.series._column) else: - result_col = localize( - self.series._column, tz, ambiguous, nonexistent - ) + result_col = localize(self.series._column, tz, ambiguous, nonexistent) return Series._from_data( data={self.series.name: result_col}, index=self.series._index, @@ -5028,9 +4935,7 @@ def components(self): @_cudf_nvtx_annotate def _get_td_field(self, field): out_column = getattr(self.series._column, field) - return Series( - data=out_column, index=self.series._index, name=self.series.name - ) + return Series(data=out_column, index=self.series._index, name=self.series.name) @_cudf_nvtx_annotate @@ -5086,9 +4991,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): # align all Series to the combined index result = [ - sr._align_to_index( - combined_index, how=how, allow_non_unique=allow_non_unique - ) + sr._align_to_index(combined_index, how=how, allow_non_unique=allow_non_unique) for sr in series_list ] diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 19dde2e51b9..5d4a3d49855 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -316,9 +316,9 @@ def _make_operands_for_binop( # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. - if isinstance( - other, SingleColumnFrame - ) and not cudf.utils.utils._is_same_name(self.name, other.name): + if isinstance(other, SingleColumnFrame) and not cudf.utils.utils._is_same_name( + self.name, other.name + ): result_name = None else: result_name = self.name @@ -326,9 +326,9 @@ def _make_operands_for_binop( if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): - if not hasattr( - other, "__cuda_array_interface__" - ) and not isinstance(other, cudf.RangeIndex): + if not hasattr(other, "__cuda_array_interface__") and not isinstance( + other, cudf.RangeIndex + ): return NotImplemented # Non-scalar right operands are valid iff they convert to columns. 
@@ -381,9 +381,7 @@ def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: return self._column.take(arg) if is_bool_dtype(arg.dtype): if (bn := len(arg)) != (n := len(self)): - raise IndexError( - f"Boolean mask has wrong length: {bn} not {n}" - ) + raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") return self._column.apply_boolean_mask(arg) raise NotImplementedError(f"Unknown indexer {type(arg)}") @@ -395,14 +393,10 @@ def where(self, cond, other=None, inplace=False): ) if isinstance(other, cudf.DataFrame): - raise NotImplementedError( - "cannot align with a higher dimensional Frame" - ) + raise NotImplementedError("cannot align with a higher dimensional Frame") cond = as_column(cond) if len(cond) != len(self): - raise ValueError( - """Array conditional must be same shape as self""" - ) + raise ValueError("""Array conditional must be same shape as self""") if not cudf.api.types.is_scalar(other): other = cudf.core.column.as_column(other) diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 24c49e3662a..475e705b4a4 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -189,8 +189,7 @@ def __call__( if padding != "max_length": error_msg = ( - "Only padding to the provided max_length" - "is currently supported" + "Only padding to the provided max_length" "is currently supported" ) raise NotImplementedError(error_msg) @@ -200,8 +199,7 @@ def __call__( if return_tensors not in {"cp", "pt", "tf"}: error_msg = ( - "Only cupy(cp), pytorch(pt) and tensorflow(tf) " - "tensors are supported" + "Only cupy(cp), pytorch(pt) and tensorflow(tf) " "tensors are supported" ) raise NotImplementedError(error_msg) @@ -219,9 +217,7 @@ def __call__( tokenizer_output = { "input_ids": cp.asarray(input_ids).reshape(-1, max_length), - "attention_mask": cp.asarray(attention_mask).reshape( - -1, max_length - ), + "attention_mask": cp.asarray(attention_mask).reshape(-1, max_length), "metadata": cp.asarray(metadata).reshape(-1, 3), } @@ -248,9 +244,7 @@ def _bert_add_special_tokens(token_o): seq_end_col = cp.clip(seq_end_col + 1, a_min=None, a_max=max_length - 1) _bert_add_special_tokens_input_ids(token_o["input_ids"], seq_end_col) - _bert_add_special_tokens_attention_mask( - token_o["attention_mask"], seq_end_col - ) + _bert_add_special_tokens_attention_mask(token_o["attention_mask"], seq_end_col) _bert_add_special_tokens_metadata(token_o["metadata"], max_length) return token_o @@ -266,9 +260,7 @@ def _bert_add_special_tokens_input_ids(input_ids, seq_end_col): input_ids[:, 0] = 101 # Mark end of sequence [SEP] - input_ids[ - cp.arange(0, input_ids.shape[0], dtype=cp.uint32), seq_end_col - ] = 102 + input_ids[cp.arange(0, input_ids.shape[0], dtype=cp.uint32), seq_end_col] = 102 def _bert_add_special_tokens_attention_mask(attention_mask, seq_end_col): @@ -292,6 +284,4 @@ def _bert_add_special_tokens_metadata(metadata, max_length): # metadata seq starts from plus 1 metadata[:, 1] = metadata[:, 1] + 1 # clip done to take overflow into account - metadata[:, 2] = cp.clip( - metadata[:, 2] + 1, a_min=None, a_max=max_length - 2 - ) + metadata[:, 2] = cp.clip(metadata[:, 2] + 1, a_min=None, a_max=max_length - 2) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index d182b7b4a7c..eee8370c019 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -231,9 +231,7 @@ def to_datetime( + 
arg[unit_rev["day"]].astype("str").str.zfill(2) ) format = "%Y-%m-%d" - col = new_series._column.as_datetime_column( - "datetime64[s]", format=format - ) + col = new_series._column.as_datetime_column("datetime64[s]", format=format) for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) @@ -267,9 +265,7 @@ def to_datetime( factor = cudf.Scalar( column.datetime._unit_to_nanoseconds_conversion[u] / ( - column.datetime._unit_to_nanoseconds_conversion[ - "s" - ] + column.datetime._unit_to_nanoseconds_conversion["s"] if np.datetime_data(col.dtype)[0] == "s" else 1 ) @@ -280,9 +276,7 @@ def to_datetime( else: times_column = times_column + (current_col * factor) if times_column is not None: - col = (col.astype(dtype="int64") + times_column).astype( - dtype=col.dtype - ) + col = (col.astype(dtype="int64") + times_column).astype(dtype=col.dtype) col = _process_col( col=col, unit=unit, @@ -337,9 +331,7 @@ def _process_col( if col.dtype.kind == "f": if unit not in (None, "ns"): - factor = cudf.Scalar( - column.datetime._unit_to_nanoseconds_conversion[unit] - ) + factor = cudf.Scalar(column.datetime._unit_to_nanoseconds_conversion[unit]) col = col * factor if format is not None: @@ -353,9 +345,7 @@ def _process_col( col.astype("int") .astype("str") .as_datetime_column( - dtype="datetime64[us]" - if "%f" in format - else "datetime64[s]", + dtype="datetime64[us]" if "%f" in format else "datetime64[s]", format=format, ) ) @@ -536,9 +526,7 @@ class DateOffset: def __init__(self, n=1, normalize=False, **kwds): if normalize: - raise NotImplementedError( - "normalize not yet supported for DateOffset" - ) + raise NotImplementedError("normalize not yet supported for DateOffset") all_possible_units = { "years", @@ -619,9 +607,7 @@ def kwds(self): def _combine_months_and_years(self, **kwargs): # TODO: if months is zero, don't do a binop - kwargs["months"] = kwargs.pop("years", 0) * 12 + kwargs.pop( - "months", 0 - ) + kwargs["months"] = kwargs.pop("years", 0) * 12 + kwargs.pop("months", 0) return kwargs def _combine_kwargs_to_seconds(self, **kwargs): @@ -646,9 +632,7 @@ def _combine_kwargs_to_seconds(self, **kwargs): kwargs["seconds"] = seconds return kwargs - def _datetime_binop( - self, datetime_col, op, reflect=False - ) -> column.DatetimeColumn: + def _datetime_binop(self, datetime_col, op, reflect=False) -> column.DatetimeColumn: if reflect and op == "__sub__": raise TypeError( f"Can not subtract a {type(datetime_col).__name__}" @@ -978,8 +962,7 @@ def date_range( # are dropped in conversion during the binops warnings.simplefilter("ignore", UserWarning) end_estim = ( - pd.Timestamp(start.value) - + periods * offset._maybe_as_fast_pandas_offset() + pd.Timestamp(start.value) + periods * offset._maybe_as_fast_pandas_offset() ).to_datetime64() if "months" in offset.kwds or "years" in offset.kwds: @@ -1066,13 +1049,10 @@ def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: def _to_iso_calendar(arg): formats = ["%G", "%V", "%u"] if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): - raise AttributeError( - "Can only use .isocalendar accessor with series or index" - ) + raise AttributeError("Can only use .isocalendar accessor with series or index") if isinstance(arg, cudf.Index): iso_params = [ - arg._column.as_string_column(arg._values.dtype, fmt) - for fmt in formats + arg._column.as_string_column(arg._values.dtype, fmt) for fmt in formats ] index = arg._column elif isinstance(arg.series, cudf.Series): diff --git a/python/cudf/cudf/core/tools/numeric.py 
b/python/cudf/cudf/core/tools/numeric.py index e1424459c8f..19ae632d7f7 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -106,9 +106,7 @@ def to_numeric(arg, errors="raise", downcast=None): if downcast not in {None, "integer", "signed", "unsigned", "float"}: raise ValueError("invalid downcasting method provided") - if not can_convert_to_column(arg) or ( - hasattr(arg, "ndim") and arg.ndim > 1 - ): + if not can_convert_to_column(arg) or (hasattr(arg, "ndim") and arg.ndim > 1): raise ValueError("arg must be column convertible") col = as_column(arg) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index fe0637cfaef..cb39dfdb196 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from functools import partial @@ -29,9 +29,7 @@ def group_reduction_impl_basic(context, builder, sig, args, function): retty = sig.return_type # a variable logically corresponding to the calling `Group` - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) + grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) # what specific (numba) GroupType grp_type = sig.args[0] @@ -55,12 +53,8 @@ def group_corr(context, builder, sig, args): """ Instruction boilerplate used for calling a groupby correlation """ - lhs_grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - rhs_grp = cgutils.create_struct_proxy(sig.args[1])( - context, builder, value=args[1] - ) + lhs_grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) + rhs_grp = cgutils.create_struct_proxy(sig.args[1])(context, builder, value=args[1]) device_func = call_cuda_functions["corr"][ ( @@ -74,12 +68,8 @@ def group_corr(context, builder, sig, args): device_func, nb_signature( types.float64, - types.CPointer( - sig.args[0].group_scalar_type - ), # this group calls corr - types.CPointer( - sig.args[1].group_scalar_type - ), # this group is passed + types.CPointer(sig.args[0].group_scalar_type), # this group calls corr + types.CPointer(sig.args[1].group_scalar_type), # this group is passed group_size_type, ), ( @@ -120,9 +110,7 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): """ retty = sig.return_type - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) + grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) grp_type = sig.args[0] if grp_type.index_type != index_default_type: @@ -154,18 +142,12 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): cuda_Group_std = partial(group_reduction_impl_basic, function="std") cuda_Group_var = partial(group_reduction_impl_basic, function="var") -cuda_Group_idxmax = partial( - group_reduction_impl_idx_max_or_min, function="idxmax" -) -cuda_Group_idxmin = partial( - group_reduction_impl_idx_max_or_min, function="idxmin" -) +cuda_Group_idxmax = partial(group_reduction_impl_idx_max_or_min, function="idxmax") +cuda_Group_idxmin = partial(group_reduction_impl_idx_max_or_min, function="idxmin") def cuda_Group_size(context, builder, sig, args): - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) + grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) return grp.size 
@@ -181,10 +163,6 @@ def cuda_Group_size(context, builder, sig, args): cuda_lower("GroupType.mean", GroupType(ty))(cuda_Group_mean) cuda_lower("GroupType.std", GroupType(ty))(cuda_Group_std) cuda_lower("GroupType.var", GroupType(ty))(cuda_Group_var) - cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))( - cuda_Group_idxmax - ) - cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))( - cuda_Group_idxmin - ) + cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))(cuda_Group_idxmax) + cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))(cuda_Group_idxmin) cuda_lower("GroupType.corr", GroupType(ty), GroupType(ty))(group_corr) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 72088493074..e288ececb88 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from typing import Any, Dict import numba @@ -30,9 +30,7 @@ numpy_support.as_dtype(dt) for dt in SUPPORTED_GROUPBY_NUMBA_TYPES ] -_UDF_DOC_URL = ( - "https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs/" -) +_UDF_DOC_URL = "https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs/" class Group: @@ -54,9 +52,8 @@ class GroupType(numba.types.Type): """ def __init__(self, group_scalar_type, index_type=index_default_type): - if ( - group_scalar_type not in SUPPORTED_GROUPBY_NUMBA_TYPES - and not isinstance(group_scalar_type, types.Poison) + if group_scalar_type not in SUPPORTED_GROUPBY_NUMBA_TYPES and not isinstance( + group_scalar_type, types.Poison ): # A frame containing an column with an unsupported dtype # is calling groupby apply. Construct a GroupType with @@ -68,9 +65,7 @@ def __init__(self, group_scalar_type, index_type=index_default_type): self.group_data_type = types.CPointer(group_scalar_type) self.group_size_type = group_size_type self.group_index_type = types.CPointer(index_type) - super().__init__( - name=f"Group({self.group_scalar_type}, {self.index_type})" - ) + super().__init__(name=f"Group({self.group_scalar_type}, {self.index_type})") class GroupByJITDataFrame(Row): @@ -201,9 +196,7 @@ def generic(self, args, kws): ) if funcs := call_cuda_functions.get(self.key.__name__): for sig in funcs.keys(): - if all( - arg.group_scalar_type == ty for arg, ty in zip(args, sig) - ): + if all(arg.group_scalar_type == ty for arg, ty in zip(args, sig)): return nb_signature(sig[0], *args) raise UDFError(self.make_error_string(args)) @@ -239,8 +232,7 @@ def generic(self, args, kws): for sig in funcs.keys(): retty, selfty, *argtys = sig if self.this.group_scalar_type == selfty and all( - arg.group_scalar_type == ty - for arg, ty in zip(args, argtys) + arg.group_scalar_type == ty for arg, ty in zip(args, argtys) ): return nb_signature(retty, *args, recvr=self.this) raise UDFError(self.make_error_string(args)) @@ -307,9 +299,7 @@ class GroupCorr(GroupBinaryAttrBase): class DataFrameAttributeTemplate(AttributeTemplate): def resolve(self, value, attr): - raise UDFError( - f"JIT GroupBy.apply() does not support DataFrame.{attr}(). " - ) + raise UDFError(f"JIT GroupBy.apply() does not support DataFrame.{attr}(). 
") @cuda_registry.register_attr @@ -329,12 +319,8 @@ class GroupAttr(AttributeTemplate): resolve_var = _make_unary_attr("var") resolve_std = _make_unary_attr("std") - resolve_size = _create_reduction_attr( - "GroupType.size", retty=group_size_type - ) - resolve_count = _create_reduction_attr( - "GroupType.count", retty=types.int64 - ) + resolve_size = _create_reduction_attr("GroupType.size", retty=group_size_type) + resolve_count = _create_reduction_attr("GroupType.count", retty=types.int64) def resolve_idxmax(self, mod): return types.BoundFunction( diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 06d9296ca0f..183ea9c6c73 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -94,9 +94,7 @@ def _groupby_apply_kernel_string_from_template(frame, args): # Generate the initializers for each device function argument initializers = [] for i, colname in enumerate(frame.keys()): - initializers.append( - group_initializer_template.format(idx=i, name=colname) - ) + initializers.append(group_initializer_template.format(idx=i, name=colname)) return groupby_apply_kernel_template.format( input_columns=input_columns, @@ -107,9 +105,7 @@ def _groupby_apply_kernel_string_from_template(frame, args): def _get_groupby_apply_kernel(frame, func, args): np_field_types = np.dtype(list(_all_dtypes_from_frame(frame).items())) - dataframe_group_type = _get_frame_groupby_type( - np_field_types, frame.index.dtype - ) + dataframe_group_type = _get_frame_groupby_type(np_field_types, frame.index.dtype) return_type = _get_udf_return_type(dataframe_group_type, func, args) @@ -219,9 +215,7 @@ def _can_be_jitted(frame, func, args): ).items() ) ) - dataframe_group_type = _get_frame_groupby_type( - np_field_types, frame.index.dtype - ) + dataframe_group_type = _get_frame_groupby_type(np_field_types, frame.index.dtype) try: _get_udf_return_type(dataframe_group_type, func, args) return True diff --git a/python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py index ae09294e3f9..18e2a33f2fc 100644 --- a/python/cudf/cudf/core/udf/masked_lowering.py +++ b/python/cudf/cudf/core/udf/masked_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import operator @@ -55,17 +55,11 @@ def masked_scalar_op_impl(context, builder, sig, args): # Let there be two actual LLVM structs backing the two inputs # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html - m1 = cgutils.create_struct_proxy(masked_type_1)( - context, builder, value=args[0] - ) - m2 = cgutils.create_struct_proxy(masked_type_2)( - context, builder, value=args[1] - ) + m1 = cgutils.create_struct_proxy(masked_type_1)(context, builder, value=args[0]) + m2 = cgutils.create_struct_proxy(masked_type_2)(context, builder, value=args[1]) # we will return an output struct - result = cgutils.create_struct_proxy(masked_return_type)( - context, builder - ) + result = cgutils.create_struct_proxy(masked_return_type)(context, builder) # compute output validity valid = builder.and_(m1.valid, m2.valid) result.valid = valid @@ -103,14 +97,10 @@ def masked_scalar_unary_op_impl(context, builder, sig, args): # MaskedType(...) 
masked_return_type = sig.return_type - m1 = cgutils.create_struct_proxy(masked_type_1)( - context, builder, value=args[0] - ) + m1 = cgutils.create_struct_proxy(masked_type_1)(context, builder, value=args[0]) # we will return an output struct - result = cgutils.create_struct_proxy(masked_return_type)( - context, builder - ) + result = cgutils.create_struct_proxy(masked_return_type)(context, builder) # compute output validity result.valid = m1.valid @@ -259,9 +249,7 @@ def masked_scalar_is_null_impl(context, builder, sig, args): na, masked_type = sig.args value = args[1] - indata = cgutils.create_struct_proxy(masked_type)( - context, builder, value=value - ) + indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=value) result = cgutils.alloca_once(builder, ir.IntType(1)) with builder.if_else(indata.valid) as (then, otherwise): with then: @@ -295,9 +283,7 @@ def pack_return_scalar_impl(context, builder, sig, args): @cuda_lower(operator.truth, MaskedType) @cuda_lower(bool, MaskedType) def masked_scalar_bool_impl(context, builder, sig, args): - indata = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) + indata = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) result = cgutils.alloca_once(builder, ir.IntType(1)) with builder.if_else(indata.valid) as (then, otherwise): with then: @@ -318,9 +304,7 @@ def masked_scalar_bool_impl(context, builder, sig, args): @cuda_lower(float, MaskedType) @cuda_lower(int, MaskedType) def masked_scalar_cast_impl(context, builder, sig, args): - input = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) + input = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) result = cgutils.create_struct_proxy(sig.return_type)(context, builder) casted = context.cast( @@ -367,9 +351,7 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # We will operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val) - casted = context.cast( - builder, operand.value, fromty.value_type, toty.value_type - ) + casted = context.cast(builder, operand.value, fromty.value_type, toty.value_type) ext = cgutils.create_struct_proxy(toty)(context, builder) ext.value = casted ext.valid = operand.valid diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index 4c90c5bbba0..c84549adfc9 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import operator @@ -47,9 +47,7 @@ TIMEDELTA_TYPES, ) -SUPPORTED_NUMPY_TYPES = ( - NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES -) +SUPPORTED_NUMPY_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) _units = ["ns", "ms", "us", "s"] @@ -150,9 +148,7 @@ def f(x): # two MaskedType unify to a new MaskedType whose value_type # is the result of unifying `self` and `other` `value_type` elif isinstance(other, MaskedType): - return MaskedType( - context.unify_pairs(self.value_type, other.value_type) - ) + return MaskedType(context.unify_pairs(self.value_type, other.value_type)) # if we have MaskedType and something that results in a # scalar, unify between the MaskedType's value_type @@ -188,8 +184,7 @@ def typeof_masked(val, c): class MaskedConstructor(ConcreteTemplate): key = api.Masked cases = [ - nb_signature(MaskedType(t), t, types.boolean) - for t in _supported_masked_types + nb_signature(MaskedType(t), t, types.boolean) for t in _supported_masked_types ] @@ -205,9 +200,7 @@ def resolve_Masked(self, mod): # Registration of the global is also needed for Numba to type api.Masked cuda_decl_registry.register_global(api, types.Module(api)) # For typing bare Masked (as in `from .api import Masked` -cuda_decl_registry.register_global( - api.Masked, types.Function(MaskedConstructor) -) +cuda_decl_registry.register_global(api.Masked, types.Function(MaskedConstructor)) # Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`. @@ -613,14 +606,10 @@ class MaskedStringViewAttrs(AttributeTemplate): key = MaskedType(string_view) def resolve_replace(self, mod): - return types.BoundFunction( - MaskedStringViewReplace, MaskedType(string_view) - ) + return types.BoundFunction(MaskedStringViewReplace, MaskedType(string_view)) def resolve_count(self, mod): - return types.BoundFunction( - MaskedStringViewCount, MaskedType(string_view) - ) + return types.BoundFunction(MaskedStringViewCount, MaskedType(string_view)) def resolve_value(self, mod): return string_view diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py index e040836f97d..843521e1956 100644 --- a/python/cudf/cudf/core/udf/row_function.py +++ b/python/cudf/cudf/core/udf/row_function.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import math import numpy as np @@ -122,9 +122,7 @@ def f(row, c, k): else unmasked_input_initializer_template ) initializers.append(template.format(idx=idx)) - row_initializers.append( - row_initializer_template.format(idx=idx, name=colname) - ) + row_initializers.append(row_initializer_template.format(idx=idx, name=colname)) return row_kernel_template.format( input_columns=input_columns, @@ -145,9 +143,7 @@ def _get_row_kernel(frame, func, args): sig = _construct_signature(frame, scalar_return_type, args) # this row type is used within the kernel to pack up the column and # mask data into the dict like data structure the user udf expects - np_field_types = np.dtype( - list(_supported_dtypes_from_frame(frame).items()) - ) + np_field_types = np.dtype(list(_supported_dtypes_from_frame(frame).items())) row_type = _get_frame_row_type(np_field_types) # Dict of 'local' variables into which `_kernel` is defined diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index fdce404d887..fd3fe3d4370 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import operator from functools import partial @@ -125,9 +125,7 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): sv = cgutils.create_struct_proxy(string_view)(context, builder) # set the empty strview data pointer to point to the literal value - sv.data = context.insert_string_const_addrspace( - builder, fromty.literal_value - ) + sv.data = context.insert_string_const_addrspace(builder, fromty.literal_value) sv.length = context.get_constant(size_type, len(fromty.literal_value)) sv.bytes = context.get_constant( size_type, len(fromty.literal_value.encode("UTF-8")) @@ -249,7 +247,7 @@ def replace_impl(context, builder, sig, args): replacement_ptr = builder.alloca(args[2].type) builder.store(args[0], src_ptr) - builder.store(args[1], to_replace_ptr), + (builder.store(args[1], to_replace_ptr),) builder.store(args[2], replacement_ptr) udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) @@ -257,9 +255,7 @@ def replace_impl(context, builder, sig, args): _ = context.compile_internal( builder, call_string_view_replace, - types.void( - _UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR - ), + types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), (udf_str_ptr, src_ptr, to_replace_ptr, replacement_ptr), ) @@ -329,9 +325,9 @@ def binary_func_impl(context, builder, sig, args): f"UDFString.{binary_func}", string_view, string_view )(binary_func_impl) else: - binary_func_impl = cuda_lower( - binary_func, string_view, string_view - )(binary_func_impl) + binary_func_impl = cuda_lower(binary_func, string_view, string_view)( + binary_func_impl + ) return binary_func_impl @@ -430,9 +426,7 @@ def id_func_impl(context, builder, sig, args): # Lookup table required for conversion functions # must be resolved at runtime after context initialization, # therefore cannot be a global variable - tbl_ptr = context.get_constant( - types.uintp, get_character_flags_table_ptr() - ) + tbl_ptr = context.get_constant(types.uintp, get_character_flags_table_ptr()) result = context.compile_internal( builder, cuda_func, @@ -473,9 +467,7 @@ def id_func_impl(context, builder, sig, args): special_tbl_ptr = context.get_constant( types.uintp, get_special_case_mapping_table_ptr() ) - udf_str_ptr = 
builder.alloca( - default_manager[udf_string].get_value_type() - ) + udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) _ = context.compile_internal( builder, @@ -569,9 +561,7 @@ def masked_len_impl(context, builder, sig, args): masked_sv = cgutils.create_struct_proxy(masked_sv_ty)( context, builder, value=args[0] ) - result = len_impl( - context, builder, size_type(string_view), (masked_sv.value,) - ) + result = len_impl(context, builder, size_type(string_view), (masked_sv.value,)) ret.value = result ret.valid = masked_sv.valid @@ -700,15 +690,11 @@ def upper_or_lower_impl(context, builder, sig, args): startswith_impl, types.boolean, ) -create_masked_binary_string_func( - "MaskedType.endswith", endswith_impl, types.boolean -) +create_masked_binary_string_func("MaskedType.endswith", endswith_impl, types.boolean) create_masked_binary_string_func("MaskedType.find", find_impl, size_type) create_masked_binary_string_func("MaskedType.rfind", rfind_impl, size_type) create_masked_binary_string_func("MaskedType.count", count_impl, size_type) -create_masked_binary_string_func( - operator.contains, contains_impl, types.boolean -) +create_masked_binary_string_func(operator.contains, contains_impl, types.boolean) create_masked_unary_identifier_func("MaskedType.isalnum", isalnum_impl) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..6268ff1b2cd 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -190,9 +190,7 @@ class StringViewReplace(AbstractTemplate): key = "StringView.replace" def generic(self, args, kws): - return nb_signature( - udf_string, string_view, string_view, recvr=self.this - ) + return nb_signature(udf_string, string_view, string_view, recvr=self.this) class StringViewAttrs(AttributeTemplate): @@ -237,9 +235,7 @@ def resolve_replace(self, mod): for func in int_binary_funcs: - setattr( - StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) - ) + setattr(StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type)) for func in id_unary_funcs: setattr( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 12baf1ea6d1..ba0d997e3ff 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -41,19 +41,13 @@ from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB -_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( - "STRINGS_UDF_HEAP_SIZE", 2**31 -) +_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31) _heap_size = 0 _cudf_str_dtype = dtype(str) JIT_SUPPORTED_TYPES = ( - NUMERIC_TYPES - | BOOL_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | STRING_TYPES + NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES ) libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) MASK_BITSIZE = np.dtype("int32").itemsize * 8 @@ -65,9 +59,7 @@ @functools.cache def _ptx_file(): return _get_ptx_file( - os.path.join( - os.path.dirname(strings_udf.__file__), "..", "core", "udf" - ), + os.path.join(os.path.dirname(strings_udf.__file__), "..", "core", "udf"), "shim_", ) @@ -124,9 +116,7 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") + colname: col.dtype if str(col.dtype) in supported_types else np.dtype("O") 
for colname, col in frame._data.items() } @@ -227,9 +217,7 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): """ scalar_argtypes = tuple(typeof(arg) for arg in args) return ( - *cudautils.make_cache_key( - func, tuple(_all_dtypes_from_frame(frame).values()) - ), + *cudautils.make_cache_key(func, tuple(_all_dtypes_from_frame(frame).values())), *(col.mask is None for col in frame._data.values()), *frame._data.keys(), scalar_argtypes, @@ -238,9 +226,7 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): @_cudf_nvtx_annotate -def _compile_or_get( - frame, func, args, kernel_getter=None, suffix="__APPLY_UDF" -): +def _compile_or_get(frame, func, args, kernel_getter=None, suffix="__APPLY_UDF"): """ Return a compiled kernel in terms of MaskedTypes that launches a kernel equivalent of `f` for the dtypes of `df`. The kernel uses @@ -291,9 +277,9 @@ def _get_kernel(kernel_string, globals_, sig, func): globals_["f_"] = f_ exec(kernel_string, globals_) _kernel = globals_["_kernel"] - kernel = cuda.jit( - sig, link=[_ptx_file()], extensions=[str_view_arg_handler] - )(_kernel) + kernel = cuda.jit(sig, link=[_ptx_file()], extensions=[str_view_arg_handler])( + _kernel + ) return kernel diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2037b1682db..b9b3b4f8e76 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -226,12 +226,8 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) + preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype("int32") + following_window = (end - idx - cudf.Scalar(1, "int32")).astype("int32") window = None else: preceding_window = as_column(self.window) @@ -263,11 +259,7 @@ def _apply_agg_dataframe(self, df, agg_name): def _apply_agg(self, agg_name): if isinstance(self.obj, cudf.Series): return cudf.Series._from_data( - { - self.obj.name: self._apply_agg_column( - self.obj._column, agg_name - ) - }, + {self.obj.name: self._apply_agg_column(self.obj._column, agg_name)}, index=self.obj.index, ) else: @@ -439,18 +431,14 @@ def _normalize(self): if self.min_periods is None: min_periods = window else: - if isinstance( - window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer) - ): + if isinstance(window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer)): # window is a device_array of window sizes or BaseIndexer self.window = window self.min_periods = min_periods return if not isinstance(self.obj.index, cudf.core.index.DatetimeIndex): - raise ValueError( - "window must be an integer for non datetime index" - ) + raise ValueError("window must be an integer for non datetime index") self._time_window = True @@ -506,14 +494,10 @@ def __init__(self, groupby, window, min_periods=None, center=False): # of `groupby.grouping.keys` and `groupby.obj`. # As an optimization, avoid gathering those twice. 
self._group_keys = groupby.grouping.keys.take(sort_order) - obj = groupby.obj.drop(columns=groupby.grouping._named_columns).take( - sort_order - ) + obj = groupby.obj.drop(columns=groupby.grouping._named_columns).take(sort_order) gb_size = groupby.size().sort_index() - self._group_starts = ( - gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) - ) + self._group_starts = gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) super().__init__(obj, window, min_periods=min_periods, center=center) diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index 7b183d5f1a3..e311f214d62 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -55,9 +55,7 @@ def timeseries( if dtypes is None: dtypes = {"name": "category", "id": int, "x": float, "y": float} - index = pd.DatetimeIndex( - pd.date_range(start, end, freq=freq, name="timestamp") - ) + index = pd.DatetimeIndex(pd.date_range(start, end, freq=freq, name="timestamp")) state = np.random.RandomState(seed) columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()} df = pd.DataFrame(columns, index=index, columns=sorted(columns)) @@ -158,9 +156,7 @@ def make_string(n, rstate): def make_categorical(n, rstate): - return pd.Categorical.from_codes( - rstate.randint(0, len(names), size=n), names - ) + return pd.Categorical.from_codes(rstate.randint(0, len(names), size=n), names) def make_bool(n, rstate): diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 728b34045bf..54fbd833b99 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf from cudf import _lib as libcudf @@ -33,7 +33,5 @@ def read_avro( ValueError("URL content-encoding decompression is not supported") return cudf.DataFrame._from_data( - *libcudf.avro.read_avro( - filepath_or_buffer, columns, skiprows, num_rows - ) + *libcudf.avro.read_avro(filepath_or_buffer, columns, skiprows, num_rows) ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3eeeac405b3..3591072e4e8 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -65,8 +65,7 @@ def read_csv( if use_python_file_object and bytes_per_thread is not None: raise ValueError( - "bytes_per_thread is only supported when " - "`use_python_file_object=False`" + "bytes_per_thread is only supported when " "`use_python_file_object=False`" ) if bytes_per_thread is None: @@ -199,20 +198,16 @@ def to_csv( try: df = df[columns] except KeyError: - raise NameError( - "Dataframe doesn't have the labels provided in columns" - ) + raise NameError("Dataframe doesn't have the labels provided in columns") for col in df._data.columns: if isinstance(col, cudf.core.column.ListColumn): raise NotImplementedError( - "Writing to csv format is not yet supported with " - "list columns." + "Writing to csv format is not yet supported with " "list columns." ) elif isinstance(col, cudf.core.column.StructColumn): raise NotImplementedError( - "Writing to csv format is not yet supported with " - "Struct columns." + "Writing to csv format is not yet supported with " "Struct columns." 
) # TODO: Need to typecast categorical columns to the underlying @@ -220,8 +215,7 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(col, cudf.core.column.CategoricalColumn) for col in df._data.columns ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) for col_name, col in df._data.items(): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index d3d99aab0cd..37b77eb14ad 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -74,8 +74,7 @@ def to_dlpack(cudf_obj): gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( - f"Input of type {type(cudf_obj)} cannot be converted " - "to DLPack tensor" + f"Input of type {type(cudf_obj)} cannot be converted " "to DLPack tensor" ) if any( @@ -84,9 +83,7 @@ def to_dlpack(cudf_obj): ): raise TypeError("non-numeric data not yet supported") - dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] - ) + dtype = cudf.utils.dtypes.find_common_type([col.dtype for col in gdf._data.columns]) gdf = gdf.astype(dtype) return libdlpack.to_dlpack([*gdf._columns]) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index b2f3fd09146..60530ad56dc 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -60,9 +60,7 @@ def read_json( if engine == "auto": engine = "cudf" if lines else "pandas" if engine != "cudf" and keep_quotes: - raise ValueError( - "keep_quotes='True' is supported only with engine='cudf'" - ) + raise ValueError("keep_quotes='True' is supported only with engine='cudf'") if engine == "cudf_legacy" or engine == "cudf": if dtype is None: @@ -132,8 +130,7 @@ def read_json( storage_options=storage_options, ): raise NotImplementedError( - "`read_json` does not yet support reading " - "multiple files via pandas" + "`read_json` does not yet support reading " "multiple files via pandas" ) path_or_buf, compression = ioutils.get_reader_filepath_or_buffer( @@ -226,13 +223,9 @@ def to_json( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libjson.write_json( - cudf_val, path_or_buf=file_obj, *args, **kwargs - ) + libjson.write_json(cudf_val, path_or_buf=file_obj, *args, **kwargs) else: - libjson.write_json( - cudf_val, path_or_buf=path_or_buf, *args, **kwargs - ) + libjson.write_json(cudf_val, path_or_buf=path_or_buf, *args, **kwargs) if return_as_string: path_or_buf.seek(0) @@ -256,6 +249,5 @@ def to_json( ) else: raise ValueError( - f"`engine` only support {{'auto', 'cudf', 'pandas'}}, " - f"got: {engine}" + f"`engine` only support {{'auto', 'cudf', 'pandas'}}, " f"got: {engine}" ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index d135a31438e..7e7e161bea7 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import datetime import warnings @@ -45,14 +45,10 @@ def _parse_column_statistics(cs, column_statistics_blob): if cs.HasField("intStatistics"): column_statistics["minimum"] = ( - cs.intStatistics.minimum - if cs.intStatistics.HasField("minimum") - else None + cs.intStatistics.minimum if cs.intStatistics.HasField("minimum") else None ) column_statistics["maximum"] = ( - cs.intStatistics.maximum - if cs.intStatistics.HasField("maximum") - else None + cs.intStatistics.maximum if cs.intStatistics.HasField("maximum") else None ) column_statistics["sum"] = ( cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None @@ -70,9 +66,7 @@ def _parse_column_statistics(cs, column_statistics_blob): else None ) column_statistics["sum"] = ( - cs.doubleStatistics.sum - if cs.doubleStatistics.HasField("sum") - else None + cs.doubleStatistics.sum if cs.doubleStatistics.HasField("sum") else None ) elif cs.HasField("stringStatistics"): @@ -91,8 +85,7 @@ def _parse_column_statistics(cs, column_statistics_blob): elif cs.HasField("bucketStatistics"): column_statistics["true_count"] = cs.bucketStatistics.count[0] column_statistics["false_count"] = ( - column_statistics["number_of_values"] - - column_statistics["true_count"] + column_statistics["number_of_values"] - column_statistics["true_count"] ) elif cs.HasField("decimalStatistics"): @@ -187,9 +180,7 @@ def read_orc_statistics( ) = liborc.read_raw_orc_statistics(path_or_buf) # Parse column names - column_names = [ - column_name.decode("utf-8") for column_name in column_names - ] + column_names = [column_name.decode("utf-8") for column_name in column_names] # Parse statistics cs = cs_pb2.ColumnStatistics() @@ -199,10 +190,7 @@ def read_orc_statistics( for i, raw_file_stats in enumerate(raw_file_statistics) if columns is None or column_names[i] in columns } - if any( - not parsed_statistics - for parsed_statistics in file_statistics.values() - ): + if any(not parsed_statistics for parsed_statistics in file_statistics.values()): continue else: files_statistics.append(file_statistics) @@ -325,15 +313,11 @@ def read_orc( # Must ensure a stripe for each source is specified, unless None if not len(stripes) == len(filepath_or_buffer): - raise ValueError( - "A list of stripes must be provided for each input source" - ) + raise ValueError("A list of stripes must be provided for each input source") filepaths_or_buffers = [] for source in filepath_or_buffer: - if ioutils.is_directory( - path_or_data=source, storage_options=storage_options - ): + if ioutils.is_directory(path_or_data=source, storage_options=storage_options): fs = ioutils._ensure_filesystem( passed_filesystem=None, path=source, @@ -350,9 +334,7 @@ def read_orc( bytes_per_thread=bytes_per_thread, ) if compression is not None: - raise ValueError( - "URL content-encoding decompression is not supported" - ) + raise ValueError("URL content-encoding decompression is not supported") if isinstance(tmp_source, list): filepaths_or_buffers.extend(tmp_source) else: @@ -393,16 +375,14 @@ def read_orc_stripe(orc_file, stripe, columns): warnings.warn("Using CPU via PyArrow to read ORC dataset.") if len(filepath_or_buffer) > 1: raise NotImplementedError( - "Using CPU via PyArrow only supports a single a " - "single input source" + "Using CPU via PyArrow only supports a single a " "single input source" ) orc_file = orc.ORCFile(filepath_or_buffer[0]) if stripes is not None and len(stripes) > 0: for stripe_source_file in stripes: pa_tables = [ - read_orc_stripe(orc_file, i, columns) - for i in stripe_source_file + 
read_orc_stripe(orc_file, i, columns) for i in stripe_source_file ] pa_table = pa.concat_tables(pa_tables) else: @@ -436,8 +416,7 @@ def to_orc( if isinstance(df.index, cudf.CategoricalIndex): raise NotImplementedError( - "Writing to ORC format is not yet supported with " - "Categorical columns." + "Writing to ORC format is not yet supported with " "Categorical columns." ) if cols_as_map_type is not None and not isinstance(cols_as_map_type, list): diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bead9c352ef..73986a407af 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -392,14 +392,9 @@ def _process_dataset( # Extract hive-partition keys, and make sure they # are ordered the same as they are in `partition_categories` if partition_categories: - raw_keys = ds._get_partition_keys( - file_fragment.partition_expression - ) + raw_keys = ds._get_partition_keys(file_fragment.partition_expression) partition_keys.append( - [ - (name, raw_keys[name]) - for name in partition_categories.keys() - ] + [(name, raw_keys[name]) for name in partition_categories.keys()] ) # Apply row-group filtering @@ -419,11 +414,7 @@ def _process_dataset( row_groups.append(filtered_row_groups) else: row_groups.append( - [ - rg_id - for rg_id in filtered_row_groups - if rg_id in selection - ] + [rg_id for rg_id in filtered_row_groups if rg_id in selection] ) return ( @@ -539,9 +530,7 @@ def read_parquet( ) if compression is not None: - raise ValueError( - "URL content-encoding decompression is not supported" - ) + raise ValueError("URL content-encoding decompression is not supported") if isinstance(tmp_source, list): filepath_or_buffer.extend(tmp_source) else: @@ -569,8 +558,7 @@ def read_parquet( if columns and filters: projected_columns = columns columns = sorted( - set(v[0] for v in itertools.chain.from_iterable(filters)) - | set(columns) + set(v[0] for v in itertools.chain.from_iterable(filters)) | set(columns) ) # Convert parquet data to a cudf.DataFrame @@ -653,9 +641,7 @@ def _handle_in(column: cudf.Series, value, *, negate) -> cudf.Series: def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: if value not in {np.nan, None}: - raise TypeError( - "Value of 'is'/'is not' filter must be np.nan or None." - ) + raise TypeError("Value of 'is'/'is not' filter must be np.nan or None.") return ~column.isna() if negate else column.isna() handlers: Dict[str, Callable] = { @@ -687,10 +673,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: ( reduce( operator.and_, - ( - handlers[op](df[column], value) - for (column, op, value) in expr - ), + (handlers[op](df[column], value) for (column, op, value) in expr), ) for expr in filters ), @@ -699,9 +682,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: return df[selection].reset_index(drop=True) return df[selection] except (KeyError, TypeError): - warnings.warn( - f"Row-wise filtering failed in read_parquet for {filters}" - ) + warnings.warn(f"Row-wise filtering failed in read_parquet for {filters}") return df @@ -728,9 +709,7 @@ def _parquet_to_frame( partition_meta = None partitioning = (dataset_kwargs or {}).get("partitioning", None) if hasattr(partitioning, "schema"): - partition_meta = cudf.DataFrame.from_arrow( - partitioning.schema.empty_table() - ) + partition_meta = cudf.DataFrame.from_arrow(partitioning.schema.empty_table()) # For partitioned data, we need a distinct read for each # unique set of partition keys. 
Therefore, we start by @@ -777,9 +756,7 @@ def _parquet_to_frame( # Not building categorical columns, so # `value` is already what we want _dtype = ( - partition_meta[name].dtype - if partition_meta is not None - else None + partition_meta[name].dtype if partition_meta is not None else None ) if pd.isna(value): dfs[-1][name] = column_empty( @@ -835,10 +812,7 @@ def _read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: - if ( - isinstance(filepaths_or_buffers, list) - and len(filepaths_or_buffers) == 1 - ): + if isinstance(filepaths_or_buffers, list) and len(filepaths_or_buffers) == 1: filepaths_or_buffers = filepaths_or_buffers[0] return cudf.DataFrame.from_pandas( @@ -927,10 +901,7 @@ def to_parquet( ) partition_info = ( - [ - (i, j - i) - for i, j in zip(partition_offsets, partition_offsets[1:]) - ] + [(i, j - i) for i, j in zip(partition_offsets, partition_offsets[1:])] if partition_offsets is not None else None ) @@ -957,9 +928,7 @@ def to_parquet( import pyarrow.parquet as pq if partition_offsets is not None: - warnings.warn( - "partition_offsets will be ignored when engine is not cudf" - ) + warnings.warn("partition_offsets will be ignored when engine is not cudf") # If index is empty set it to the expected default value of True if index is None: @@ -1018,9 +987,7 @@ def _get_partitioned( preserve_index=False, storage_options=None, ): - fs = ioutils._ensure_filesystem( - fs, root_path, storage_options=storage_options - ) + fs = ioutils._ensure_filesystem(fs, root_path, storage_options=storage_options) fs.mkdirs(root_path, exist_ok=True) part_names, grouped_df, part_offsets = _get_groups_and_offsets( @@ -1031,10 +998,7 @@ def _get_partitioned( metadata_file_paths = [] for keys in part_names.itertuples(index=False): subdir = fs.sep.join( - [ - _hive_dirname(name, val) - for name, val in zip(partition_cols, keys) - ] + [_hive_dirname(name, val) for name, val in zip(partition_cols, keys)] ) prefix = fs.sep.join([root_path, subdir]) fs.mkdirs(prefix, exist_ok=True) @@ -1065,9 +1029,7 @@ def _get_groups_and_offsets( grouped_df.drop(columns=partition_cols, inplace=True) # Copy the entire keys df in one operation rather than using iloc part_names = ( - part_keys.take(part_offsets[:-1]) - .to_pandas(nullable=True) - .to_frame(index=False) + part_keys.take(part_offsets[:-1]).to_pandas(nullable=True).to_frame(index=False) ) return part_names, grouped_df, part_offsets @@ -1122,16 +1084,12 @@ def _parse_bytes(s): try: n = float(prefix) except ValueError as e: - raise ValueError( - "Could not interpret '%s' as a number" % prefix - ) from e + raise ValueError("Could not interpret '%s' as a number" % prefix) from e try: multiplier = BYTE_SIZES[suffix.lower()] except KeyError as e: - raise ValueError( - "Could not interpret '%s' as a byte unit" % suffix - ) from e + raise ValueError("Could not interpret '%s' as a byte unit" % suffix) from e result = n * multiplier return int(result) @@ -1249,8 +1207,7 @@ def __init__( if max_file_size is not None: if file_name_prefix is None: raise ValueError( - "file_name_prefix cannot be None if max_file_size is " - "passed" + "file_name_prefix cannot be None if max_file_size is " "passed" ) self.max_file_size = _parse_bytes(max_file_size) @@ -1275,10 +1232,7 @@ def write_table(self, df): for idx, keys in enumerate(part_names.itertuples(index=False)): subdir = fs.sep.join( - [ - f"{name}={val}" - for name, val in zip(self.partition_cols, keys) - ] + [f"{name}={val}" for name, val in zip(self.partition_cols, keys)] ) prefix = fs.sep.join([self.path, 
subdir]) fs.mkdirs(prefix, exist_ok=True) @@ -1296,9 +1250,9 @@ def write_table(self, df): # if the file is too large, compute metadata for # smaller chunks parts = math.ceil(current_file_size / self.max_file_size) - new_offsets = list( - range(start, end, int((end - start) / parts)) - )[1:] + new_offsets = list(range(start, end, int((end - start) / parts)))[ + 1: + ] new_offsets.append(end) num_chunks = len(new_offsets) parts = len(new_offsets) @@ -1315,31 +1269,24 @@ def write_table(self, df): # Check if the same `new_file_name` exists and # generate a `new_file_name` while new_full_path in self._file_sizes and ( - self._file_sizes[new_full_path] - + (current_file_size / parts) + self._file_sizes[new_full_path] + (current_file_size / parts) ) > (self.max_file_size): curr_file_num += 1 - new_file_name = ( - f"{self.filename}_{curr_file_num}.parquet" - ) + new_file_name = f"{self.filename}_{curr_file_num}.parquet" new_full_path = fs.sep.join([prefix, new_file_name]) self._file_sizes[new_full_path] = self._file_sizes.get( new_full_path, 0 ) + (current_file_size / parts) full_paths.append(new_full_path) - metadata_file_paths.append( - fs.sep.join([subdir, new_file_name]) - ) + metadata_file_paths.append(fs.sep.join([subdir, new_file_name])) num_chunks += 1 curr_file_num += 1 else: self.filename = self.filename or _generate_filename() full_path = fs.sep.join([prefix, self.filename]) full_paths.append(full_path) - metadata_file_paths.append( - fs.sep.join([subdir, self.filename]) - ) + metadata_file_paths.append(fs.sep.join([subdir, self.filename])) full_offsets.append(current_offset[1]) paths, metadata_file_paths, offsets = ( @@ -1425,9 +1372,7 @@ def __exit__(self, *args): self.close() -def _default_open_file_options( - open_file_options, columns, row_groups, fs=None -): +def _default_open_file_options(open_file_options, columns, row_groups, fs=None): """ Set default fields in open_file_options. diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 7a0db49bd20..cc80fda3792 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import os import textwrap @@ -62,9 +62,7 @@ def _register_option( Raised by validator if the value is invalid. """ validator(default_value) - _OPTIONS[name] = Option( - default_value, default_value, description, validator - ) + _OPTIONS[name] = Option(default_value, default_value, description, validator) def get_option(name: str) -> Any: @@ -144,8 +142,7 @@ def _make_contains_validator(valid_options: Container) -> Callable: def _validator(val): if val not in valid_options: raise ValueError( - f"{val} is not a valid option. " - f"Must be one of {set(valid_options)}." + f"{val} is not a valid option. " f"Must be one of {set(valid_options)}." ) return _validator @@ -183,9 +180,7 @@ def _integer_validator(val): int(val) return True except ValueError: - raise ValueError( - f"{val} is not a valid option. " f"Must be an integer." - ) + raise ValueError(f"{val} is not a valid option. " f"Must be an integer.") def _integer_and_none_validator(val): @@ -339,8 +334,7 @@ class option_context(ContextDecorator): def __init__(self, *args) -> None: if len(args) % 2 != 0: raise ValueError( - "Need to invoke as option_context(pat, val, " - "[(pat, val), ...])." + "Need to invoke as option_context(pat, val, " "[(pat, val), ...])." 
) self.ops = tuple(zip(args[::2], args[1::2])) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index b7c8e92e8db..c948993f556 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -412,9 +412,7 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.BooleanArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") - }, + additional_attributes={"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")}, ) BooleanDtype = make_final_proxy_type( @@ -432,9 +430,7 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.IntegerArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") - }, + additional_attributes={"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")}, ) Int8Dtype = make_final_proxy_type( @@ -552,9 +548,7 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.FloatingArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") - }, + additional_attributes={"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")}, ) Float32Dtype = make_final_proxy_type( @@ -819,9 +813,7 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): "_TextFileReader", _Unusable, pd.io.parsers.readers.TextFileReader ) -_XportReader = make_intermediate_proxy_type( - "_XportReader", _Unusable, pd_XportReader -) +_XportReader = make_intermediate_proxy_type("_XportReader", _Unusable, pd_XportReader) _SAS7BDATReader = make_intermediate_proxy_type( "_SAS7BDATReader", _Unusable, pd_SAS7BDATReader diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 3f5df18eae1..9d3186e5de4 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -267,9 +267,7 @@ def __init__(self, *args, **kwargs): # disallow __init__. An intermediate proxy type can only be # instantiated from (possibly chained) operations on a final # proxy type. - raise TypeError( - f"Cannot directly instantiate object of type {type(self)}" - ) + raise TypeError(f"Cannot directly instantiate object of type {type(self)}") @property # type: ignore def _fsproxy_state(self): @@ -406,8 +404,7 @@ def __get__(self, obj, owner=None) -> Any: obj = owner if not ( - isinstance(obj, _FastSlowProxy) - or issubclass(type(obj), _FastSlowProxyMeta) + isinstance(obj, _FastSlowProxy) or issubclass(type(obj), _FastSlowProxyMeta) ): # we only want to look up attributes on the underlying # fast/slow objects for instances of _FastSlowProxy or @@ -437,9 +434,7 @@ def __get__(self, obj, owner=None) -> Any: # methods because dir for the method won't be the same as for # the pure unbound function, but the alternative is # materializing the slow object when we don't really want to. 
- result._fsproxy_slow_dir = dir( - slow_result_type - ) # type: ignore + result._fsproxy_slow_dir = dir(slow_result_type) # type: ignore return result @@ -579,9 +574,7 @@ def __getattr__(self, name: str) -> Any: return obj if not _is_function_or_method(obj): - return _maybe_wrap_result( - obj, getattr, self._fsproxy_slow, name - ) + return _maybe_wrap_result(obj, getattr, self._fsproxy_slow, name) @functools.wraps(obj) def _wrapped_private_slow(*args, **kwargs): @@ -939,8 +932,7 @@ def _transform_arg( # transformed pieces # This handles scipy._lib._bunch._make_tuple_bunch args, kwargs = ( - _transform_arg(a, attribute_name, seen) - for a in arg.__getnewargs_ex__() + _transform_arg(a, attribute_name, seen) for a in arg.__getnewargs_ex__() ) obj = type(arg).__new__(type(arg), *args, **kwargs) if hasattr(obj, "__setstate__"): @@ -962,9 +954,7 @@ def _transform_arg( return type(arg).__new__(type(arg), *args) else: # Hope we can just call the constructor with transformed entries. - return type(arg)( - _transform_arg(a, attribute_name, seen) for a in args - ) + return type(arg)(_transform_arg(a, attribute_name, seen) for a in args) elif isinstance(arg, dict): return { _transform_arg(k, attribute_name, seen): _transform_arg( @@ -973,9 +963,7 @@ def _transform_arg( for k, a in arg.items() } elif isinstance(arg, np.ndarray) and arg.dtype == "O": - transformed = [ - _transform_arg(a, attribute_name, seen) for a in arg.flat - ] + transformed = [_transform_arg(a, attribute_name, seen) for a in arg.flat] # Keep the same memory layout as arg (the default is C_CONTIGUOUS) if arg.flags["F_CONTIGUOUS"] and not arg.flags["C_CONTIGUOUS"]: order = "F" @@ -1049,9 +1037,7 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: elif isinstance(result, Iterator): return (_maybe_wrap_result(r, lambda x: x, r) for r in result) elif _is_function_or_method(result): - return _MethodProxy._fsproxy_wrap( - result, method_chain=(func, args, kwargs) - ) + return _MethodProxy._fsproxy_wrap(result, method_chain=(func, args, kwargs)) else: return result diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index e97d6e4af24..abcf0f3e98c 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -79,22 +79,16 @@ def deduce_cudf_pandas_mode(slow_lib: str, fast_lib: str) -> DeducedMode: if "CUDF_PANDAS_FALLBACK_MODE" not in os.environ: try: importlib.import_module(fast_lib) - return DeducedMode( - use_fast_lib=True, slow_lib=slow_lib, fast_lib=fast_lib - ) + return DeducedMode(use_fast_lib=True, slow_lib=slow_lib, fast_lib=fast_lib) except Exception as e: warnings.warn( f"Exception encountered importing {fast_lib}: {e}." f"Falling back to only using {slow_lib}." 
) - return DeducedMode( - use_fast_lib=False, slow_lib=slow_lib, fast_lib=slow_lib - ) + return DeducedMode(use_fast_lib=False, slow_lib=slow_lib, fast_lib=slow_lib) -class ModuleAcceleratorBase( - importlib.abc.MetaPathFinder, importlib.abc.Loader -): +class ModuleAcceleratorBase(importlib.abc.MetaPathFinder, importlib.abc.Loader): _instance: ModuleAcceleratorBase | None = None mod_name: str fast_lib: str @@ -127,9 +121,7 @@ def __new__( Name of package that provides "slow" fallback implementation """ if ModuleAcceleratorBase._instance is not None: - raise RuntimeError( - "Only one instance of ModuleAcceleratorBase allowed" - ) + raise RuntimeError("Only one instance of ModuleAcceleratorBase allowed") self = object.__new__(cls) self.mod_name = mod_name self.fast_lib = fast_lib @@ -151,8 +143,7 @@ def __new__( def __repr__(self) -> str: return ( - f"{self.__class__.__name__}" - f"(fast={self.fast_lib}, slow={self.slow_lib})" + f"{self.__class__.__name__}" f"(fast={self.fast_lib}, slow={self.slow_lib})" ) def find_spec( @@ -172,9 +163,7 @@ def find_spec( A ModuleSpec with ourself as loader if we're interposing, otherwise None to pass off to the next loader. """ - if fullname == self.mod_name or fullname.startswith( - f"{self.mod_name}." - ): + if fullname == self.mod_name or fullname.startswith(f"{self.mod_name}."): return importlib.machinery.ModuleSpec( name=fullname, loader=self, @@ -316,9 +305,7 @@ def _wrap_attribute( # now, attempt to import the wrapped module, which will # recursively wrap all of its attributes: return importlib.import_module( - rename_root_module( - slow_attr.__name__, self.slow_lib, self.mod_name - ) + rename_root_module(slow_attr.__name__, self.slow_lib, self.mod_name) ) if slow_attr in self._wrapped_objs: if type(fast_attr) is _Unusable: @@ -557,11 +544,8 @@ def getattr_real_or_wrapped( # We cannot possibly be at the top level. assert frame.f_back calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) - use_real = not calling_module.is_relative_to( - CUDF_PANDAS_PATH - ) and any( - calling_module.is_relative_to(path) - for path in loader._denylist + use_real = not calling_module.is_relative_to(CUDF_PANDAS_PATH) and any( + calling_module.is_relative_to(path) for path in loader._denylist ) try: if use_real: @@ -596,9 +580,7 @@ def install( ) mode = deduce_cudf_pandas_mode(slow_lib, fast_lib) if mode.use_fast_lib: - importlib.import_module( - f".._wrappers.{mode.slow_lib}", __name__ - ) + importlib.import_module(f".._wrappers.{mode.slow_lib}", __name__) try: (self,) = ( p diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index c5662d06e09..2f0dcd44ac8 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -124,7 +124,7 @@ def get_namespaced_function_name( _MethodProxy, type[_FinalProxy], type[_IntermediateProxy], - ] + ], ): if isinstance(func_obj, _MethodProxy): # Extract classname from method object @@ -145,16 +145,11 @@ def get_namespaced_function_name( def _tracefunc(self, frame, event, arg): if event == "line" and frame.f_code.co_filename == self._currfile: key = "".join(inspect.stack()[1].code_context) - if not any( - ignore_word in key for ignore_word in Profiler._IGNORE_LIST - ): + if not any(ignore_word in key for ignore_word in Profiler._IGNORE_LIST): self._currkey = (frame.f_lineno, self._currfile, key) self._results.setdefault(self._currkey, {}) self._timer[self._currkey] = time.perf_counter() - elif ( - event == "call" - and frame.f_code.co_name == "_fast_slow_function_call" - ): + elif event == "call" and frame.f_code.co_name == "_fast_slow_function_call": if self._currkey is not None: self._timer[self._currkey] = time.perf_counter() @@ -170,25 +165,18 @@ def _tracefunc(self, frame, event, arg): ): func_name = self.get_namespaced_function_name(func_obj) self._call_stack.append((func_name, time.perf_counter())) - elif ( - event == "return" - and frame.f_code.co_name == "_fast_slow_function_call" - ): + elif event == "return" and frame.f_code.co_name == "_fast_slow_function_call": if self._currkey is not None and arg is not None: if arg[1]: # fast run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey][ - "gpu_time" - ] = run_time + self._results[self._currkey].get( - "gpu_time", 0 - ) + self._results[self._currkey]["gpu_time"] = run_time + self._results[ + self._currkey + ].get("gpu_time", 0) else: run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey][ - "cpu_time" - ] = run_time + self._results[self._currkey].get( - "cpu_time", 0 - ) + self._results[self._currkey]["cpu_time"] = run_time + self._results[ + self._currkey + ].get("cpu_time", 0) frame_locals = inspect.getargvalues(frame).locals if ( diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index f1744c9e92b..c4f1161399a 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -35,9 +35,7 @@ def count_failures(log_file_name, pattern): and line["when"] == "call" and line["outcome"] == "failed" ): - line_module_name = line["location"][0].removeprefix( - PANDAS_TEST_PREFIX - ) + line_module_name = line["location"][0].removeprefix(PANDAS_TEST_PREFIX) if fnmatch(line_module_name, pattern): if "longrepr" in line and line["longrepr"]: if isinstance(line["longrepr"], (tuple, list)): diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index bfc56319d82..0144abb0dca 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -47,9 +47,7 @@ def get_per_module_results(log_file_name): # it's an xpassed test outcome = "failed" module_name = ( - line["nodeid"] - .split("::")[0] - .removeprefix(PANDAS_TEST_PREFIX) + line["nodeid"].split("::")[0].removeprefix(PANDAS_TEST_PREFIX) ) per_module_results.setdefault(module_name, {}) per_module_results[module_name].setdefault("total", 0) @@ -60,9 +58,7 @@ def get_per_module_results(log_file_name): def sort_results(results): - sorted_keys = sorted( - results, key=lambda key: results[key].get("failed", 0) - ) + sorted_keys = sorted(results, key=lambda key: results[key].get("failed", 0)) return {key: results[key] for key in sorted_keys} @@ -98,9 +94,7 @@ def print_results_as_table(results): if __name__ == "__main__": # parse arguments parser = argparse.ArgumentParser() - parser.add_argument( - "log_file_name", nargs=1, help="The input log file name" - ) + parser.add_argument("log_file_name", nargs=1, help="The input log file name") parser.add_argument( "--output", choices=["json", "table"], diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index e067d15af4c..78fc826212e 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -125,9 +125,7 @@ def assert_eq(left, right, **kwargs): # `object`. Check equality before that happens: if kwargs.get("check_dtype", True): if hasattr(left, "dtype") and hasattr(right, "dtype"): - if isinstance( - left.dtype, cudf.core.dtypes._BaseDtype - ) and not isinstance( + if isinstance(left.dtype, cudf.core.dtypes._BaseDtype) and not isinstance( left.dtype, cudf.CategoricalDtype ): # leave categorical comparison to Pandas assert_eq(left.dtype, right.dtype) @@ -150,9 +148,7 @@ def assert_eq(left, right, **kwargs): # This warning comes from a call from pandas to numpy. It is ignored # here because it cannot be fixed within cudf. 
with warnings.catch_warnings(): - warnings.simplefilter( - "ignore", (DeprecationWarning, FutureWarning) - ) + warnings.simplefilter("ignore", (DeprecationWarning, FutureWarning)) if isinstance(left, pd.DataFrame): tm.assert_frame_equal(left, right, **kwargs) elif isinstance(left, pd.Series): @@ -312,9 +308,7 @@ def gen_rand(dtype, size, **kwargs): elif dtype.kind == "b": low = kwargs.get("low", 0) high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype( - np.bool_ - ) + return np.random.randint(low=low, high=high, size=size).astype(np.bool_) elif dtype.kind == "M": low = kwargs.get("low", 0) time_unit, _ = np.datetime_data(dtype) @@ -331,9 +325,7 @@ def gen_rand(dtype, size, **kwargs): nchars = np.random.randint(low=low, high=high, size=1)[0] char_options = np.array(list(string.ascii_letters + string.digits)) all_chars = "".join(np.random.choice(char_options, nchars * size)) - return np.array( - [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] - ) + return np.array([all_chars[nchars * i : nchars * (i + 1)] for i in range(size)]) raise NotImplementedError(f"dtype.kind={dtype.kind}") diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13c194d6be0..c1bf9ac0746 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -104,10 +104,7 @@ def _generate_column(column_params, num_rows): # Construct set of values to sample from where # set size = cardinality - if ( - isinstance(column_params.dtype, str) - and column_params.dtype == "category" - ): + if isinstance(column_params.dtype, str) and column_params.dtype == "category": vals = pa.array( column_params.generator, size=column_params.cardinality, @@ -115,9 +112,7 @@ def _generate_column(column_params, num_rows): ) return pa.DictionaryArray.from_arrays( dictionary=vals, - indices=np.random.randint( - low=0, high=len(vals), size=num_rows - ), + indices=np.random.randint(low=0, high=len(vals), size=num_rows), mask=np.random.choice( [True, False], size=num_rows, @@ -178,9 +173,7 @@ def _generate_column(column_params, num_rows): else None, size=num_rows, safe=False, - type=None - if isinstance(arrow_type, pa.lib.Decimal128Type) - else arrow_type, + type=None if isinstance(arrow_type, pa.lib.Decimal128Type) else arrow_type, ) if isinstance(arrow_type, pa.lib.Decimal128Type): vals = vals.cast(arrow_type, safe=False) @@ -242,10 +235,7 @@ def get_dataframe(parameters, use_threads): # Get schema for each column table_fields = [] for i, column_params in enumerate(parameters.column_parameters): - if ( - isinstance(column_params.dtype, str) - and column_params.dtype == "category" - ): + if isinstance(column_params.dtype, str) and column_params.dtype == "category": arrow_type = pa.dictionary( index_type=pa.int64(), value_type=np_to_pa_dtype( @@ -280,9 +270,7 @@ def get_dataframe(parameters, use_threads): # Generate data if not use_threads: for i, column_params in enumerate(parameters.column_parameters): - column_data[i] = _generate_column( - column_params, parameters.num_rows - ) + column_data[i] = _generate_column(column_params, parameters.num_rows) else: pool = Pool(pa.cpu_count()) column_data = pool.starmap( @@ -398,9 +386,7 @@ def rand_dataframe( ) ) elif dtype == "decimal64": - max_precision = meta.get( - "max_precision", cudf.Decimal64Dtype.MAX_PRECISION - ) + max_precision = meta.get("max_precision", cudf.Decimal64Dtype.MAX_PRECISION) precision = np.random.randint(1, max_precision) 
scale = np.random.randint(0, precision) dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) @@ -414,9 +400,7 @@ def rand_dataframe( ) ) elif dtype == "decimal32": - max_precision = meta.get( - "max_precision", cudf.Decimal32Dtype.MAX_PRECISION - ) + max_precision = meta.get("max_precision", cudf.Decimal32Dtype.MAX_PRECISION) precision = np.random.randint(1, max_precision) scale = np.random.randint(0, precision) dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) @@ -684,13 +668,9 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): for _ in range(cardinality) ] elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) + values = datetime_generator(dtype=dtype, size=cardinality)().astype(dtype) elif dtype.kind == "m": - values = timedelta_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) + values = timedelta_generator(dtype=dtype, size=cardinality)().astype(dtype) elif dtype.kind == "b": values = boolean_generator(cardinality)().astype(dtype) else: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index fc253c5c197..5eb53335e2e 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -35,13 +35,9 @@ def dtype_can_compare_equal_to_other(dtype): def _check_isinstance(left, right, obj): if not isinstance(left, obj): - raise AssertionError( - f"{obj} Expected type {obj}, found {type(left)} instead" - ) + raise AssertionError(f"{obj} Expected type {obj}, found {type(left)} instead") elif not isinstance(right, obj): - raise AssertionError( - f"{obj} Expected type {obj}, found {type(right)} instead" - ) + raise AssertionError(f"{obj} Expected type {obj}, found {type(right)} instead") def raise_assert_detail(obj, message, left, right, diff=None): @@ -57,9 +53,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): raise AssertionError(msg) -def _check_types( - left, right, check_categorical=True, exact="equiv", obj="Index" -): +def _check_types(left, right, check_categorical=True, exact="equiv", obj="Index"): if not exact or exact == "equiv": if ( isinstance(left, cudf.RangeIndex) @@ -83,15 +77,9 @@ def _check_types( obj, "Class types are different", f"{type(left)}", f"{type(right)}" ) - if ( - exact - and not isinstance(left, cudf.MultiIndex) - and _is_categorical_dtype(left) - ): + if exact and not isinstance(left, cudf.MultiIndex) and _is_categorical_dtype(left): if left.dtype != right.dtype: - raise_assert_detail( - obj, "Categorical difference", f"{left}", f"{right}" - ) + raise_assert_detail(obj, "Categorical difference", f"{left}", f"{right}") def assert_column_equal( @@ -205,11 +193,7 @@ def assert_column_equal( f"{obj} category", "Orders are different", msg1, msg2 ) - if ( - not check_dtype - and _is_categorical_dtype(left) - and _is_categorical_dtype(right) - ): + if not check_dtype and _is_categorical_dtype(left) and _is_categorical_dtype(right): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) columns_equal = False @@ -227,30 +211,18 @@ def assert_column_equal( ): try: # nulls must be in the same places for all dtypes - columns_equal = cp.all( - left.isnull().values == right.isnull().values - ) + columns_equal = cp.all(left.isnull().values == right.isnull().values) - if ( - columns_equal - and not check_exact - and is_numeric_dtype(left.dtype) - ): + if columns_equal and not check_exact and is_numeric_dtype(left.dtype): # non-null values must be the same 
columns_equal = cp.allclose( - left.apply_boolean_mask( - left.isnull().unary_operator("not") - ).values, + left.apply_boolean_mask(left.isnull().unary_operator("not")).values, right.apply_boolean_mask( right.isnull().unary_operator("not") ).values, ) - if columns_equal and ( - left.dtype.kind == right.dtype.kind == "f" - ): - columns_equal = cp.all( - is_nan(left).values == is_nan(right).values - ) + if columns_equal and (left.dtype.kind == right.dtype.kind == "f"): + columns_equal = cp.all(is_nan(left).values == is_nan(right).values) else: columns_equal = left.equals(right) except TypeError as e: @@ -373,9 +345,7 @@ def assert_index_equal( # instance validation _check_isinstance(left, right, cudf.BaseIndex) - _check_types( - left, right, exact=exact, check_categorical=check_categorical, obj=obj - ) + _check_types(left, right, exact=exact, check_categorical=check_categorical, obj=obj) if len(left) != len(right): raise_assert_detail( @@ -421,9 +391,7 @@ def assert_index_equal( # metadata comparison if check_names and (left.name != right.name): - raise_assert_detail( - obj, "name mismatch", f"{left.name}", f"{right.name}" - ) + raise_assert_detail(obj, "name mismatch", f"{left.name}", f"{right.name}") def assert_series_equal( @@ -548,9 +516,7 @@ def assert_series_equal( # metadata comparison if check_names and (left.name != right.name): - raise_assert_detail( - obj, "name mismatch", f"{left.name}", f"{right.name}" - ) + raise_assert_detail(obj, "name mismatch", f"{left.name}", f"{right.name}") def assert_frame_equal( diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py index f2c2d9a263b..bd1dbba5428 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py +++ b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
import pandas as pd @@ -8,12 +8,8 @@ def test_slice_datetimetz_index(): data = ["2001-01-01", "2001-01-02", None, None, "2001-01-03"] - pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize( - "US/Eastern" - ) - idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize( - "US/Eastern" - ) + pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize("US/Eastern") + idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize("US/Eastern") expected = pidx[1:4] got = idx[1:4] assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py index b28ef131025..43edebb5b4f 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py +++ b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py @@ -21,9 +21,7 @@ def test_tz_convert(): idx = cudf.from_pandas(pidx) pidx = pidx.tz_localize("UTC") idx = idx.tz_localize("UTC") - assert_eq( - pidx.tz_convert("America/New_York"), idx.tz_convert("America/New_York") - ) + assert_eq(pidx.tz_convert("America/New_York"), idx.tz_convert("America/New_York")) def test_delocalize_naive(): diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index d59041e32d5..2d47f051342 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -79,9 +79,7 @@ def test_interval_range_empty(closed): @pytest.mark.parametrize("end", [6, 8, 10, 43, 70]) def test_interval_range_freq_basic(start, end, freq, closed): pindex = pd.interval_range(start=start, end=end, freq=freq, closed=closed) - gindex = cudf.interval_range( - start=start, end=end, freq=freq, closed=closed - ) + gindex = cudf.interval_range(start=start, end=end, freq=freq, closed=closed) assert_eq(pindex, gindex) @@ -97,9 +95,7 @@ def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t): pindex = pd.interval_range( start=start_val, end=end_val, freq=freq_val, closed="left" ) - gindex = cudf.interval_range( - start=start, end=end, freq=freq, closed="left" - ) + gindex = cudf.interval_range(start=start, end=end, freq=freq, closed="left") if gindex.dtype.subtype.kind == "f": gindex = gindex.astype( cudf.IntervalDtype(subtype="float64", closed=gindex.dtype.closed) @@ -119,12 +115,8 @@ def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t): @pytest.mark.parametrize("start", [0, 0.0, 1.0, 1, 2, 2.0, 3.0, 3]) @pytest.mark.parametrize("end", [4, 4.0, 5.0, 5, 6, 6.0, 7.0, 7]) def test_interval_range_periods_basic(start, end, periods, closed): - pindex = pd.interval_range( - start=start, end=end, periods=periods, closed=closed - ) - gindex = cudf.interval_range( - start=start, end=end, periods=periods, closed=closed - ) + pindex = pd.interval_range(start=start, end=end, periods=periods, closed=closed) + gindex = cudf.interval_range(start=start, end=end, periods=periods, closed=closed) assert_eq(pindex, gindex) @@ -136,15 +128,11 @@ def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): start, end, periods = start_t(0), end_t(4), periods_t(1) start_val = start.value if isinstance(start, cudf.Scalar) else start end_val = end.value if isinstance(end, cudf.Scalar) else end - periods_val = ( - periods.value if isinstance(periods, cudf.Scalar) else periods - ) + periods_val = periods.value if isinstance(periods, cudf.Scalar) else periods pindex = pd.interval_range( start=start_val, end=end_val, periods=periods_val, closed="left" ) - 
gindex = cudf.interval_range( - start=start, end=end, periods=periods, closed="left" - ) + gindex = cudf.interval_range(start=start, end=end, periods=periods, closed="left") assert_eq(pindex, gindex) @@ -169,12 +157,8 @@ def test_interval_range_periods_warnings(): @pytest.mark.parametrize("freq", [1, 2, 3, 4]) @pytest.mark.parametrize("end", [4, 8, 9, 10]) def test_interval_range_periods_freq_end(end, freq, periods, closed): - pindex = pd.interval_range( - end=end, freq=freq, periods=periods, closed=closed - ) - gindex = cudf.interval_range( - end=end, freq=freq, periods=periods, closed=closed - ) + pindex = pd.interval_range(end=end, freq=freq, periods=periods, closed=closed) + gindex = cudf.interval_range(end=end, freq=freq, periods=periods, closed=closed) assert_eq(pindex, gindex) @@ -186,15 +170,11 @@ def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t): periods, freq, end = periods_t(2), freq_t(3), end_t(10) freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq end_val = end.value if isinstance(end, cudf.Scalar) else end - periods_val = ( - periods.value if isinstance(periods, cudf.Scalar) else periods - ) + periods_val = periods.value if isinstance(periods, cudf.Scalar) else periods pindex = pd.interval_range( end=end_val, freq=freq_val, periods=periods_val, closed="left" ) - gindex = cudf.interval_range( - end=end, freq=freq, periods=periods, closed="left" - ) + gindex = cudf.interval_range(end=end, freq=freq, periods=periods, closed="left") assert_eq(pindex, gindex) @@ -204,12 +184,8 @@ def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t): @pytest.mark.parametrize("freq", [1, 2, 3, 4]) @pytest.mark.parametrize("start", [1, 4, 9, 12]) def test_interval_range_periods_freq_start(start, freq, periods, closed): - pindex = pd.interval_range( - start=start, freq=freq, periods=periods, closed=closed - ) - gindex = cudf.interval_range( - start=start, freq=freq, periods=periods, closed=closed - ) + pindex = pd.interval_range(start=start, freq=freq, periods=periods, closed=closed) + gindex = cudf.interval_range(start=start, freq=freq, periods=periods, closed=closed) assert_eq(pindex, gindex) @@ -221,15 +197,11 @@ def test_interval_range_periods_freq_start_dtype(periods_t, freq_t, start_t): periods, freq, start = periods_t(2), freq_t(3), start_t(9) freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq start_val = start.value if isinstance(start, cudf.Scalar) else start - periods_val = ( - periods.value if isinstance(periods, cudf.Scalar) else periods - ) + periods_val = periods.value if isinstance(periods, cudf.Scalar) else periods pindex = pd.interval_range( start=start_val, freq=freq_val, periods=periods_val, closed="left" ) - gindex = cudf.interval_range( - start=start, freq=freq, periods=periods, closed="left" - ) + gindex = cudf.interval_range(start=start, freq=freq, periods=periods, closed="left") # pandas upcasts to 64 bit https://github.com/pandas-dev/pandas/issues/57268 # using Series to use check_dtype @@ -337,9 +309,7 @@ def test_interval_index_from_breaks(closed): ], ) def test_interval_range_floating(start, stop, freq, periods): - expected = pd.interval_range( - start=start, end=stop, freq=freq, periods=periods - ) + expected = pd.interval_range(start=start, end=stop, freq=freq, periods=periods) got = interval_range(start=start, end=stop, freq=freq, periods=periods) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py index 
acba13bb5b0..3839908d4d5 100644 --- a/python/cudf/cudf/tests/input_output/test_text.py +++ b/python/cudf/cudf/tests/input_output/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from io import StringIO @@ -23,10 +23,7 @@ def test_read_text(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] + [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] ) actual = cudf.read_text(chess_file, delimiter=delimiter) @@ -45,10 +42,7 @@ def test_read_text_byte_range(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] + [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] ) byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) @@ -84,9 +78,7 @@ def test_read_text_byte_range_large(tmpdir): expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) - actual = cudf.read_text( - temp_file, delimiter=delimiter, byte_range=[1000, 1000] - ) + actual = cudf.read_text(temp_file, delimiter=delimiter, byte_range=[1000, 1000]) assert_eq(expected, actual) @@ -106,9 +98,7 @@ def test_read_text_in_memory_strip_delimiter(datadir): # not we need to add it back to the 'content' expected = cudf.Series(["x", "y", "z"]) - actual = cudf.read_text( - StringIO("x::y::z"), delimiter="::", strip_delimiters=True - ) + actual = cudf.read_text(StringIO("x::y::z"), delimiter="::", strip_delimiters=True) assert_eq(expected, actual) @@ -124,10 +114,7 @@ def test_read_text_bgzip(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] + [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] ) actual = cudf.read_text( @@ -148,10 +135,7 @@ def test_read_text_bgzip_offsets(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] + [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] ) actual = cudf.read_text( diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 98be7045923..976e9e5216e 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -33,9 +33,7 @@ def unit(request): return request.param -@pytest.fixture( - params=["America/New_York", "Asia/Tokyo", "CET", "Etc/GMT+1", "UTC"] -) +@pytest.fixture(params=["America/New_York", "Asia/Tokyo", "CET", "Etc/GMT+1", "UTC"]) def tz(request): return request.param @@ -69,9 +67,7 @@ def test_localize_ambiguous(request, unit, zone_name): ], dtype=f"datetime64[{unit}]", ) - expect = s.to_pandas().dt.tz_localize( - zone_name, ambiguous="NaT", nonexistent="NaT" - ) + expect = s.to_pandas().dt.tz_localize(zone_name, ambiguous="NaT", nonexistent="NaT") got = s.dt.tz_localize(zone_name) assert_eq(expect, got) @@ -95,17 +91,15 @@ def test_localize_nonexistent(request, unit, zone_name): ], 
dtype=f"datetime64[{unit}]", ) - expect = s.to_pandas().dt.tz_localize( - zone_name, ambiguous="NaT", nonexistent="NaT" - ) + expect = s.to_pandas().dt.tz_localize(zone_name, ambiguous="NaT", nonexistent="NaT") got = s.dt.tz_localize(zone_name) assert_eq(expect, got) def test_delocalize(unit, tz): - psr = pd.Series( - pd.date_range("2001-01-01", "2001-01-02", freq="1s") - ).astype(f"datetime64[{unit}]") + psr = pd.Series(pd.date_range("2001-01-01", "2001-01-02", freq="1s")).astype( + f"datetime64[{unit}]" + ) sr = cudf.from_pandas(psr) expect = psr.dt.tz_localize(tz).dt.tz_localize(None) @@ -123,12 +117,8 @@ def test_delocalize_naive(): assert_eq(expect, got) -@pytest.mark.parametrize( - "from_tz", ["Europe/London", "America/Chicago", "UTC"] -) -@pytest.mark.parametrize( - "to_tz", ["Europe/London", "America/Chicago", "UTC", None] -) +@pytest.mark.parametrize("from_tz", ["Europe/London", "America/Chicago", "UTC"]) +@pytest.mark.parametrize("to_tz", ["Europe/London", "America/Chicago", "UTC", None]) def test_convert(from_tz, to_tz): ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="h")) gs = cudf.from_pandas(ps) @@ -169,12 +159,8 @@ def test_convert_from_naive(): ], ) def test_convert_edge_cases(data, original_timezone, target_timezone): - ps = pd.Series(data, dtype="datetime64[s]").dt.tz_localize( - original_timezone - ) - gs = cudf.Series(data, dtype="datetime64[s]").dt.tz_localize( - original_timezone - ) + ps = pd.Series(data, dtype="datetime64[s]").dt.tz_localize(original_timezone) + gs = cudf.Series(data, dtype="datetime64[s]").dt.tz_localize(original_timezone) expect = ps.dt.tz_convert(target_timezone) got = gs.dt.tz_convert(target_timezone) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 58939f0ddd9..7bf20bf97b3 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -58,9 +58,7 @@ def test_array_func_cudf_series(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) +@pytest.mark.parametrize("pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]) @pytest.mark.parametrize( "func", [ @@ -82,9 +80,7 @@ def test_array_func_cudf_dataframe(pd_df, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) +@pytest.mark.parametrize("pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]) @pytest.mark.parametrize( "func", [ diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index b036c1f13f3..c9d423a59c5 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -22,9 +22,7 @@ ) _UFUNCS = [ - obj - for obj in (getattr(np, name) for name in dir(np)) - if isinstance(obj, np.ufunc) + obj for obj in (getattr(np, name) for name in dir(np)) if isinstance(obj, np.ufunc) ] @@ -267,9 +265,7 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @pytest.mark.parametrize("reflect", [True, False]) -def test_binary_ufunc_series_array( - request, ufunc, has_nulls, indexed, reflect -): +def test_binary_ufunc_series_array(request, ufunc, has_nulls, indexed, reflect): fname = ufunc.__name__ request.applymarker( 
pytest.mark.xfail( @@ -286,13 +282,9 @@ def test_binary_ufunc_series_array( request.applymarker( pytest.mark.xfail( condition=( - fname in {"greater", "greater_equal", "logical_and"} - and has_nulls - ), - reason=( - "cudf and pandas incompatible casting nans " - "to nulls in binops" + fname in {"greater", "greater_equal", "logical_and"} and has_nulls ), + reason=("cudf and pandas incompatible casting nans " "to nulls in binops"), ) ) N = 100 @@ -425,9 +417,7 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): if indexed and ufunc.nin == 2 else args ) - mask = reduce( - operator.or_, (a["foo"].isna() for a in aligned) - ).to_pandas() + mask = reduce(operator.or_, (a["foo"].isna() for a in aligned)).to_pandas() got = ufunc(*args) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 0e38b10ed52..609012a4c6d 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -64,9 +64,7 @@ def test_can_detect_dtype_from_avro_type( actual = cudf_from_avro_util(schema, []) - expected = cudf.DataFrame( - {"prop": cudf.Series(None, None, expected_dtype)} - ) + expected = cudf.DataFrame({"prop": cudf.Series(None, None, expected_dtype)}) assert_eq(expected, actual) @@ -104,9 +102,7 @@ def test_can_detect_dtype_from_avro_type_nested( ns="" if namespace is None else namespace + "." ) - expected = cudf.DataFrame( - {col_name: cudf.Series(None, None, expected_dtype)} - ) + expected = cudf.DataFrame({col_name: cudf.Series(None, None, expected_dtype)}) assert_eq(expected, actual) @@ -137,9 +133,7 @@ def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)} - ) + expected = cudf.DataFrame({"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)}) assert_eq(expected, actual) @@ -156,9 +150,7 @@ def test_can_parse_single_null(avro_type, cudf_type): actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[None], dtype=cudf_type)} - ) + expected = cudf.DataFrame({"prop": cudf.Series(data=[None], dtype=cudf_type)}) assert_eq(expected, actual) @@ -180,9 +172,7 @@ def test_can_parse_no_data(avro_type, cudf_type): assert_eq(expected, actual) -@pytest.mark.xfail( - reason="cudf avro reader is unable to parse zero-field metadata." 
-) +@pytest.mark.xfail(reason="cudf avro reader is unable to parse zero-field metadata.") @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) def test_can_parse_no_fields(avro_type, cudf_type): schema_root = { @@ -285,9 +275,7 @@ def test_can_detect_dtypes_from_avro_logical_type( actual = cudf_from_avro_util(schema, []) - expected = cudf.DataFrame( - {"prop": cudf.Series(None, None, expected_dtype)} - ) + expected = cudf.DataFrame({"prop": cudf.Series(None, None, expected_dtype)}) assert_eq(expected, actual) @@ -349,9 +337,7 @@ def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): actual = cudf_from_avro_util(schema, records) - expected = cudf.DataFrame( - {"o_date": cudf.Series(dates, dtype="datetime64[s]")} - ) + expected = cudf.DataFrame({"o_date": cudf.Series(dates, dtype="datetime64[s]")}) assert_eq(expected, actual) @@ -611,9 +597,7 @@ def test_avro_reader_multiblock( source_df = cudf.DataFrame({"0": pd.Series(values)}) if limit_rows: - expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index( - drop=True - ) + expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index(drop=True) else: expected_df = source_df[skip_rows:].reset_index(drop=True) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 438f3e35ec8..bf54964219e 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -240,9 +240,7 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar): @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @pytest.mark.parametrize("binop", _bitwise_binops) -@pytest.mark.parametrize( - "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types)) -) +@pytest.mark.parametrize("lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))) def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): arr1 = (np.random.random(100) * 100).astype(lhs_dtype) sr1 = Series(arr1) @@ -342,13 +340,9 @@ def cmp_scalar(request): return request.param -def test_str_series_compare_str( - str_series_cmp_data, str_series_compare_str_cmpop -): +def test_str_series_compare_str(str_series_cmp_data, str_series_compare_str_cmpop): expect = str_series_compare_str_cmpop(str_series_cmp_data, "a") - got = str_series_compare_str_cmpop( - Series.from_pandas(str_series_cmp_data), "a" - ) + got = str_series_compare_str_cmpop(Series.from_pandas(str_series_cmp_data), "a") utils.assert_eq(expect, got.to_pandas(nullable=True)) @@ -357,9 +351,7 @@ def test_str_series_compare_str_reflected( str_series_cmp_data, str_series_compare_str_cmpop ): expect = str_series_compare_str_cmpop("a", str_series_cmp_data) - got = str_series_compare_str_cmpop( - "a", Series.from_pandas(str_series_cmp_data) - ) + got = str_series_compare_str_cmpop("a", Series.from_pandas(str_series_cmp_data)) utils.assert_eq(expect, got.to_pandas(nullable=True)) @@ -391,9 +383,7 @@ def test_str_series_compare_num_reflected( @pytest.mark.parametrize("cmpop", _cmpops) @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES + ["datetime64[ms]"]) @pytest.mark.parametrize("use_cudf_scalar", [True, False]) -def test_series_compare_scalar( - nelem, cmpop, obj_class, dtype, use_cudf_scalar -): +def test_series_compare_scalar(nelem, cmpop, obj_class, dtype, use_cudf_scalar): arr1 = np.random.randint(0, 100, 100).astype(dtype) sr1 = Series(arr1) rhs = random.choice(arr1).item() @@ -451,13 +441,13 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls): utils.expand_bits_to_bytes(lhs_mask & rhs_mask), 
dtype=np.bool_ )[:nelem] if lhs_nulls == "some" and rhs_nulls == "none": - res_mask = np.asarray( - utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool_ - )[:nelem] + res_mask = np.asarray(utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool_)[ + :nelem + ] if lhs_nulls == "none" and rhs_nulls == "some": - res_mask = np.asarray( - utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool_ - )[:nelem] + res_mask = np.asarray(utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool_)[ + :nelem + ] # Fill NA values na_value = -10000 got = res.fillna(na_value).to_numpy() @@ -643,12 +633,8 @@ def test_different_shapes_and_same_columns(binop): if binop is operator.pow: return - pd_frame = binop( - pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [1, 2, 3]}) - ) - cd_frame = binop( - cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({"x": [1, 2, 3]}) - ) + pd_frame = binop(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [1, 2, 3]})) + cd_frame = binop(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({"x": [1, 2, 3]})) # cast x as float64 so it matches pandas dtype cd_frame["x"] = cd_frame["x"].astype(np.float64) utils.assert_eq(cd_frame, pd_frame) @@ -664,9 +650,7 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) # Test with an Index - pdf2 = pd.DataFrame( - {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] - ) + pdf2 = pd.DataFrame({"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4]) # Test with an Index in a different order pdf3 = pd.DataFrame( {"x": [4, 5, 6, 7], "y": [1, 2, 3, 7], "z": [0, 5, 3, 7]}, @@ -735,18 +719,12 @@ def test_operator_func_between_series(dtype, func, has_nulls, fill_value): gdf_series_a = utils.gen_rand_series( dtype, count, has_nulls=has_nulls, stride=10000 ) - gdf_series_b = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=100 - ) + gdf_series_b = utils.gen_rand_series(dtype, count, has_nulls=has_nulls, stride=100) pdf_series_a = gdf_series_a.to_pandas() pdf_series_b = gdf_series_b.to_pandas() - gdf_result = getattr(gdf_series_a, func)( - gdf_series_b, fill_value=fill_value - ) - pdf_result = getattr(pdf_series_a, func)( - pdf_series_b, fill_value=fill_value - ) + gdf_result = getattr(gdf_series_a, func)(gdf_series_b, fill_value=fill_value) + pdf_result = getattr(pdf_series_a, func)(pdf_series_b, fill_value=fill_value) utils.assert_eq(pdf_result, gdf_result) @@ -761,18 +739,14 @@ def test_operator_func_series_and_scalar( ): count = 1000 scalar = 59 - gdf_series = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=10000 - ) + gdf_series = utils.gen_rand_series(dtype, count, has_nulls=has_nulls, stride=10000) pdf_series = gdf_series.to_pandas() gdf_series_result = getattr(gdf_series, func)( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, ) - pdf_series_result = getattr(pdf_series, func)( - scalar, fill_value=fill_value - ) + pdf_series_result = getattr(pdf_series, func)(scalar, fill_value=fill_value) utils.assert_eq(pdf_series_result, gdf_series_result) @@ -794,12 +768,8 @@ def test_operator_func_between_series_logical( pdf_series_a = gdf_series_a.to_pandas(nullable=True) pdf_series_b = gdf_series_b.to_pandas(nullable=True) - gdf_series_result = getattr(gdf_series_a, func)( - gdf_series_b, fill_value=fill_value - ) - pdf_series_result = getattr(pdf_series_a, func)( - pdf_series_b, fill_value=fill_value - ) + gdf_series_result = getattr(gdf_series_a, func)(gdf_series_b, fill_value=fill_value) + 
pdf_series_result = getattr(pdf_series_a, func)(pdf_series_b, fill_value=fill_value) expect = pdf_series_result got = gdf_series_result.to_pandas(nullable=True) @@ -845,9 +815,7 @@ def test_operator_func_series_and_scalar_logical( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, ) - pdf_series_result = getattr(pdf_series, func)( - scalar, fill_value=fill_value - ) + pdf_series_result = getattr(pdf_series, func)(scalar, fill_value=fill_value) expect = pdf_series_result got = gdf_series_result.to_pandas(nullable=True) @@ -873,9 +841,7 @@ def gen_df(): colname = ascii_lowercase[cols[i]] data = utils.gen_rand("float64", num_rows) * 10000 if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data return pdf @@ -909,9 +875,7 @@ def gen_df(): colname = ascii_lowercase[cols[i]] data = utils.gen_rand("float64", num_rows) * 10000 if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data return pdf @@ -950,14 +914,10 @@ def gen_df(): def test_binop_bool_uint(func, rhs): psr = pd.Series([True, False, False]) gsr = cudf.from_pandas(psr) - utils.assert_eq( - getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False - ) + utils.assert_eq(getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False) -@pytest.mark.parametrize( - "series_dtype", (np.int8, np.uint8, np.int64, np.uint64) -) +@pytest.mark.parametrize("series_dtype", (np.int8, np.uint8, np.int64, np.uint64)) @pytest.mark.parametrize( "divisor_dtype", ( @@ -1008,9 +968,7 @@ def test_floordiv_zero_bool(scalar_divisor): pytest.param( np.bool_, marks=pytest_xfail( - reason=( - "Pandas handling of division by zero-bool is too strange" - ) + reason=("Pandas handling of division by zero-bool is too strange") ), ), np.int8, @@ -1089,29 +1047,19 @@ def make_scalar_add_data(): ) # to any float, we may add any int, float, or bool - valid |= set( - product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) - ) + valid |= set(product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) # to any datetime, we may add any int, timedelta, or bool - valid |= set( - product(DATETIME_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES) - ) + valid |= set(product(DATETIME_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES)) # to any timedelta, we may add any int, datetime, other timedelta, or bool - valid |= set( - product(TIMEDELTA_TYPES, INTEGER_TYPES | DATETIME_TYPES | BOOL_TYPES) - ) + valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | DATETIME_TYPES | BOOL_TYPES)) # to any bool, we may add any int, float, datetime, timedelta, or bool valid |= set( product( BOOL_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, + INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, ) ) @@ -1185,9 +1133,7 @@ def make_scalar_difference_data(): ) # from any float, we may subtract any int, float, or bool - valid |= set( - product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) - ) + valid |= set(product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) # from any datetime we may subtract any int, datetime, timedelta, or bool valid |= set( @@ -1198,14 +1144,10 @@ def make_scalar_difference_data(): ) # from any timedelta we may subtract any int, 
timedelta, or bool - valid |= set( - product(TIMEDELTA_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES) - ) + valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES)) # from any bool we may subtract any int, float or timedelta - valid |= set( - product(BOOL_TYPES, INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES) - ) + valid |= set(product(BOOL_TYPES, INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES)) return sorted(list(valid)) @@ -1248,9 +1190,7 @@ def test_scalar_difference(dtype_l, dtype_r): assert expect.dtype == got.dtype -@pytest.mark.parametrize( - "dtype_l,dtype_r", make_scalar_difference_data_invalid() -) +@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_difference_data_invalid()) def test_scalar_difference_invalid(dtype_l, dtype_r): test_value = 1 @@ -1276,9 +1216,7 @@ def make_scalar_product_data(): valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES)) # we can multiply a float by any int, float, or bool - valid |= set( - product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) - ) + valid |= set(product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) return sorted(list(valid)) @@ -1290,11 +1228,7 @@ def make_scalar_product_data_invalid(): # or bools by datetimes invalid |= set( product( - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, + INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, DATETIME_TYPES, ) ) @@ -1303,11 +1237,7 @@ def make_scalar_product_data_invalid(): invalid |= set( product( DATETIME_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, + INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, ) ) @@ -1382,11 +1312,7 @@ def make_scalar_floordiv_data_invalid(): invalid |= set( product( DATETIME_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, + INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, ) ) @@ -1413,9 +1339,7 @@ def test_scalar_floordiv(dtype_l, dtype_r): assert expect.dtype == got.dtype -@pytest.mark.parametrize( - "dtype_l,dtype_r", make_scalar_floordiv_data_invalid() -) +@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_floordiv_data_invalid()) def test_scalar_floordiv_invalid(dtype_l, dtype_r): test_value = 1 @@ -1460,18 +1384,12 @@ def make_scalar_truediv_data_invalid(): invalid |= set( product( DATETIME_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, + INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, ) ) # we cant true divide timedeltas by datetimes or bools or floats - invalid |= set( - product(TIMEDELTA_TYPES, DATETIME_TYPES | BOOL_TYPES | FLOAT_TYPES) - ) + invalid |= set(product(TIMEDELTA_TYPES, DATETIME_TYPES | BOOL_TYPES | FLOAT_TYPES)) return sorted(list(invalid)) @@ -1573,9 +1491,7 @@ def test_scalar_remainder(dtype_l, dtype_r): assert expect.dtype == got.dtype -@pytest.mark.parametrize( - "dtype_l,dtype_r", make_scalar_remainder_data_invalid() -) +@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_remainder_data_invalid()) def test_scalar_remainder_invalid(dtype_l, dtype_r): test_value = 1 @@ -1602,11 +1518,7 @@ def make_scalar_power_data_invalid(): # datetimes and timedeltas cant go in exponents invalid |= set( product( - INTEGER_TYPES - | FLOAT_TYPES - | TIMEDELTA_TYPES - | DATETIME_TYPES - | BOOL_TYPES, + INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES | DATETIME_TYPES | BOOL_TYPES, DATETIME_TYPES | 
TIMEDELTA_TYPES, ) ) @@ -1616,11 +1528,7 @@ def make_scalar_power_data_invalid(): invalid |= set( product( DATETIME_TYPES | TIMEDELTA_TYPES, - DATETIME_TYPES - | TIMEDELTA_TYPES - | INTEGER_TYPES - | FLOAT_TYPES - | BOOL_TYPES, + DATETIME_TYPES | TIMEDELTA_TYPES | INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, ) ) @@ -1661,14 +1569,8 @@ def make_scalar_null_binops_data(): + [(operator.sub, *dtypes) for dtypes in make_scalar_difference_data()] + [(operator.mul, *dtypes) for dtypes in make_scalar_product_data()] + [(operator.add, *dtypes) for dtypes in make_scalar_add_data()] - + [ - (operator.floordiv, *dtypes) - for dtypes in make_scalar_floordiv_data() - ] - + [ - (operator.truediv, *dtypes) - for dtypes in make_scalar_truediv_data() - ] + + [(operator.floordiv, *dtypes) for dtypes in make_scalar_floordiv_data()] + + [(operator.truediv, *dtypes) for dtypes in make_scalar_truediv_data()] + [(operator.mod, *dtypes) for dtypes in make_scalar_remainder_data()] + [(operator.pow, *dtypes) for dtypes in make_scalar_power_data()] ) @@ -1714,9 +1616,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], ) @pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_datetime_dateoffset_binaryop( - request, n_periods, frequency, dtype, op -): +def test_datetime_dateoffset_binaryop(request, n_periods, frequency, dtype, op): request.applymarker( pytest.mark.xfail( PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION @@ -1775,9 +1675,7 @@ def test_datetime_dateoffset_binaryop( @pytest.mark.filterwarnings( "ignore:Non-vectorized DateOffset:pandas.errors.PerformanceWarning" ) -@pytest.mark.filterwarnings( - "ignore:Discarding nonzero nanoseconds:UserWarning" -) +@pytest.mark.filterwarnings("ignore:Discarding nonzero nanoseconds:UserWarning") @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") @@ -2335,9 +2233,7 @@ def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): ), ], ) -def test_binops_reflect_decimal( - op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype -): +def test_binops_reflect_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): a = utils._decimal_series(lhs, l_dtype) b = utils._decimal_series(rhs, r_dtype) expect = utils._decimal_series(expect, expect_dtype) @@ -3018,9 +2914,7 @@ def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): @pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"]) def test_equality_ops_index_mismatch(fn): - a = cudf.Series( - [1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"] - ) + a = cudf.Series([1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"]) b = cudf.Series( [-5, 4, 3, 2, 1, 0, 19, 11], index=["aa", "b", "c", "d", "e", "f", "y", "z"], @@ -3152,9 +3046,7 @@ def test_empty_column(binop, data, scalar): "other", [ cudf.DataFrame([[9, 10], [11, 12], [13, 14], [15, 16]]), - cudf.DataFrame( - [[9.4, 10.5], [11.6, 12.7], [13.8, 14.9], [15.1, 16.2]] - ), + cudf.DataFrame([[9.4, 10.5], [11.6, 12.7], [13.8, 14.9], [15.1, 16.2]]), cudf.Series([5, 6, 7, 8]), cudf.Series([5.6, 6.7, 7.8, 8.9]), np.array([5, 6, 7, 8]), diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ad32ebce01b..372911ced54 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -48,9 +48,7 @@ def 
test_categorical_basic(): assert_eq(pdsr.cat.categories, sr.cat.categories) assert pdsr.cat.ordered == sr.cat.ordered - np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_numpy() - ) + np.testing.assert_array_equal(pdsr.cat.codes.values, sr.cat.codes.to_numpy()) string = str(sr) expect_str = """ @@ -191,9 +189,7 @@ def test_categorical_masking(): expect_matches = pdsr == "a" got_matches = sr == "a" - np.testing.assert_array_equal( - expect_matches.values, got_matches.to_numpy() - ) + np.testing.assert_array_equal(expect_matches.values, got_matches.to_numpy()) # mask series expect_masked = pdsr[expect_matches] @@ -257,9 +253,7 @@ def test_categorical_unique(num_elements): np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice( - list(string.ascii_letters + string.digits), num_elements - ), + np.random.choice(list(string.ascii_letters + string.digits), num_elements), dtype="category", ) ) @@ -284,9 +278,7 @@ def test_categorical_unique_count(nelem): np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice( - list(string.ascii_letters + string.digits), nelem - ), + np.random.choice(list(string.ascii_letters + string.digits), nelem), dtype="category", ) ) @@ -315,9 +307,7 @@ def test_categorical_empty(): assert_eq(pdsr.cat.categories, sr.cat.categories) assert pdsr.cat.ordered == sr.cat.ordered - np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_numpy() - ) + np.testing.assert_array_equal(pdsr.cat.codes.values, sr.cat.codes.to_numpy()) def test_categorical_set_categories(): @@ -648,9 +638,7 @@ def test_add_categories(data, add): with _hide_cudf_safe_casting_warning(): actual = gds.cat.add_categories(add) - assert_eq( - expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype) - ) + assert_eq(expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype)) # Need to type-cast pandas object to str due to mixed-type # support in "object" @@ -754,16 +742,14 @@ def test_categorical_allow_nan(): assert_eq(expected_categories, gs.cat.categories) actual_ps = gs.to_pandas() - expected_ps = pd.Series( - [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" - ) + expected_ps = pd.Series([1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category") assert_eq(actual_ps, expected_ps) def test_categorical_setitem_with_nan(): - gs = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") + gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( + "category" + ) gs[[1, 3]] = np.nan expected_series = cudf.Series( @@ -776,9 +762,7 @@ def test_categorical_setitem_with_nan(): @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): dtype = cudf.dtype(dtype) - input_obj = [ - dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj - ] + input_obj = [dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj] expect = pd.Series(input_obj, dtype="category") got = cudf.Series(input_obj, dtype="category").to_pandas() diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 8e8555b2005..9f9612f4933 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -88,8 +88,7 @@ def test_column_offset_and_size(pandas_input, offset, size): if col.size > 0: assert col.size == (col.children[0].size - 1) assert col.size == ( - (col.children[0].data.size / col.children[0].dtype.itemsize) - - 1 + (col.children[0].data.size / 
col.children[0].dtype.itemsize) - 1 ) else: assert col.size == (col.data.size / col.dtype.itemsize) @@ -200,11 +199,7 @@ def test_column_mixed_dtype(data, error): ) @pytest.mark.parametrize("size", [1, 10]) def test_as_column_scalar_with_nan(nan_as_null, scalar, size): - expected = ( - cudf.Series([scalar] * size, nan_as_null=nan_as_null) - .dropna() - .to_numpy() - ) + expected = cudf.Series([scalar] * size, nan_as_null=nan_as_null).dropna().to_numpy() got = ( cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) @@ -335,8 +330,7 @@ def test_column_view_valid_string_to_numeric(data, to_dtype): def test_column_view_nulls_widths_even(): data = [1, 2, None, 4, None] expect_data = [ - np.int32(val).view("float32") if val is not None else np.nan - for val in data + np.int32(val).view("float32") if val is not None else np.nan for val in data ] sr = cudf.Series(data, dtype="int32") @@ -347,8 +341,7 @@ def test_column_view_nulls_widths_even(): data = [None, 2.1, None, 5.3, 8.8] expect_data = [ - np.float64(val).view("int64") if val is not None else val - for val in data + np.float64(val).view("int64") if val is not None else val for val in data ] sr = cudf.Series(data, dtype="float64") @@ -369,9 +362,7 @@ def test_column_view_numeric_slice(slc): assert_eq(expect, got) -@pytest.mark.parametrize( - "slc", [slice(3, 5), slice(0, 4), slice(2, 5), slice(1, 3)] -) +@pytest.mark.parametrize("slc", [slice(3, 5), slice(0, 4), slice(2, 5), slice(1, 3)]) def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] @@ -507,9 +498,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = mask_to_bools(gd_data._column.base_mask, 0, len(gd_data)).values_host np.testing.assert_array_equal(expect_mask, got_mask) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index a8eac2edf2b..32425c78d26 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -76,9 +76,7 @@ def test_to_pandas_multiindex_names(): ) assert_eq( ca.to_pandas_index(), - pd.MultiIndex.from_tuples( - (("a", "b"), ("c", "d")), names=("foo", "bar") - ), + pd.MultiIndex.from_tuples((("a", "b"), ("c", "d")), names=("foo", "bar")), ) @@ -255,9 +253,7 @@ def test_select_by_index_empty(): }, multiindex=True, ) - expect = ColumnAccessor( - {}, multiindex=True, level_names=((None, None, None)) - ) + expect = ColumnAccessor({}, multiindex=True, level_names=((None, None, None))) got = ca.select_by_index(slice(None, 0)) check_ca_equal(expect, got) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index cdb47ea79d8..3c04f62d29f 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -25,8 +25,7 @@ def _hide_concat_empty_dtype_warning(): # being caught and validated in other tests. 
warnings.filterwarnings( "ignore", - "The behavior of array concatenation with empty entries " - "is deprecated.", + "The behavior of array concatenation with empty entries " "is deprecated.", category=FutureWarning, ) yield @@ -207,9 +206,7 @@ def test_concat_misordered_columns(): @pytest.mark.parametrize("axis", [1, "columns"]) def test_concat_columns(axis): pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3]) - pdf2 = pd.DataFrame( - np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7] - ) + pdf2 = pd.DataFrame(np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7]) gdf1 = cudf.from_pandas(pdf1) gdf2 = cudf.from_pandas(pdf2) @@ -266,9 +263,7 @@ def test_concat_multiindex_series(): pd.concat([pdg1, pdg2]), check_index_type=True, ) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1) - ) + assert_eq(cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) def test_concat_multiindex_dataframe_and_series(): @@ -329,21 +324,11 @@ def test_concat_string_index_name(myindex): def test_pandas_concat_compatibility_axis1(): - d1 = cudf.datasets.randomdata( - 3, dtypes={"a": float, "ind": float} - ).set_index("ind") - d2 = cudf.datasets.randomdata( - 3, dtypes={"b": float, "ind": float} - ).set_index("ind") - d3 = cudf.datasets.randomdata( - 3, dtypes={"c": float, "ind": float} - ).set_index("ind") - d4 = cudf.datasets.randomdata( - 3, dtypes={"d": float, "ind": float} - ).set_index("ind") - d5 = cudf.datasets.randomdata( - 3, dtypes={"e": float, "ind": float} - ).set_index("ind") + d1 = cudf.datasets.randomdata(3, dtypes={"a": float, "ind": float}).set_index("ind") + d2 = cudf.datasets.randomdata(3, dtypes={"b": float, "ind": float}).set_index("ind") + d3 = cudf.datasets.randomdata(3, dtypes={"c": float, "ind": float}).set_index("ind") + d4 = cudf.datasets.randomdata(3, dtypes={"d": float, "ind": float}).set_index("ind") + d5 = cudf.datasets.randomdata(3, dtypes={"e": float, "ind": float}).set_index("ind") pd1 = d1.to_pandas() pd2 = d2.to_pandas() @@ -462,15 +447,11 @@ def test_concat_mixed_input(): pd.DataFrame({"a": [1, 2]}), ], [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] - ), + pd.Series([1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130]), pd.DataFrame({"a": [1, 2]}), ], [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] - ), + pd.Series([1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"]), pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), ], [ @@ -558,9 +539,7 @@ def test_concat_series_dataframe_input_str(objs): [ pd.DataFrame(), pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), + pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), @@ -574,9 +553,7 @@ def test_concat_series_dataframe_input_str(objs): [ [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], [ - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), + pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), pd.DataFrame(), pd.DataFrame(), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), @@ -585,16 +562,12 @@ def test_concat_series_dataframe_input_str(objs): pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame( - {"cat": 
pd.Series(["two", "three"], dtype="category")} - ), + pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), ], [ pd.DataFrame([]), pd.DataFrame([], index=[100]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), + pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), ], ], ) @@ -715,9 +688,7 @@ def test_concat_dataframe_with_multiindex(df1, df2): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))} - ), + pd.DataFrame({"x": range(10, 20), "y": list(map(float, range(10, 20)))}), ], [ pd.DataFrame( @@ -755,9 +726,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis): gpu_objs = [cudf.from_pandas(o) for o in objs] assert_eq( - pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), + pd.concat(objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis), cudf.concat( gpu_objs, sort=sort, @@ -780,9 +749,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))} - ), + pd.DataFrame({"x": range(10, 20), "y": list(map(float, range(10, 20)))}), ], ], ) @@ -813,9 +780,7 @@ def test_concat_join_axis_1_dup_error(objs): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ), - pd.DataFrame( - {"l": range(10, 20), "m": list(map(float, range(10, 20)))} - ), + pd.DataFrame({"l": range(10, 20), "m": list(map(float, range(10, 20)))}), ], ], ) @@ -853,9 +818,7 @@ def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ) - pdf2 = pd.DataFrame( - {"l": range(10, 20), "m": list(map(float, range(10, 20)))} - ) + pdf2 = pd.DataFrame({"l": range(10, 20), "m": list(map(float, range(10, 20)))}) pdf3 = pd.DataFrame({"j": [1, 2], "k": [1, 2], "s": [1, 2], "t": [1, 2]}) pdf_empty1 = pd.DataFrame() @@ -916,12 +879,8 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), ), ( - pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"] - ), - pd.DataFrame( - {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"] - ), + pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"]), + pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"]), ), ], ) @@ -929,9 +888,7 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_no_overlapping_columns( - pdf1, pdf2, ignore_index, sort, join, axis -): +def test_concat_join_no_overlapping_columns(pdf1, pdf2, ignore_index, sort, join, axis): gdf1 = cudf.from_pandas(pdf1) gdf2 = cudf.from_pandas(pdf2) @@ -1002,12 +959,8 @@ def test_concat_join_no_overlapping_columns_many_and_empty( "objs", [ [ - pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"] - ), - pd.DataFrame( - {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"] - ), + pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"]), + pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"]), pd.DataFrame( { "x": range(10), @@ -1152,9 +1105,7 @@ def test_concat_join_series(ignore_index, sort, join, axis): [ pd.DataFrame(), pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), + pd.DataFrame({"c": [10, 11, 22, 33, 
44, 100]}, index=[7, 8, 9, 10, 11, 20]), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), @@ -1168,9 +1119,7 @@ def test_concat_join_series(ignore_index, sort, join, axis): [ [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], [ - pd.DataFrame( - {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), + pd.DataFrame({"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), pd.DataFrame(), pd.DataFrame(), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), @@ -1179,16 +1128,12 @@ def test_concat_join_series(ignore_index, sort, join, axis): pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), + pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), ], [ pd.DataFrame([]), pd.DataFrame([], index=[100]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), + pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), ], ], ) @@ -1196,9 +1141,7 @@ def test_concat_join_series(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0]) -def test_concat_join_empty_dataframes( - df, other, ignore_index, axis, join, sort -): +def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort): other_pd = [df] + other gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1216,9 +1159,7 @@ def test_concat_join_empty_dataframes( if not _is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 - expected[key] = ( - expected[key].fillna("-1").astype("str") - ) + expected[key] = expected[key].fillna("-1").astype("str") else: expected[key] = ( expected[key] @@ -1235,9 +1176,7 @@ def test_concat_join_empty_dataframes( expected.fillna(-1), actual.fillna(-1), check_dtype=False, - check_index_type=False - if len(expected) == 0 or actual.empty - else True, + check_index_type=False if len(expected) == 0 or actual.empty else True, check_column_type=False, ) else: @@ -1262,9 +1201,7 @@ def test_concat_join_empty_dataframes( [ pd.DataFrame(), pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), + pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), @@ -1278,9 +1215,7 @@ def test_concat_join_empty_dataframes( [ [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], [ - pd.DataFrame( - {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), + pd.DataFrame({"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), pd.DataFrame(), pd.DataFrame(), pd.DataFrame([[5, 6], [7, 8]], columns=list("CD")), @@ -1289,16 +1224,12 @@ def test_concat_join_empty_dataframes( pd.DataFrame({"g": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"h": [10]}), pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame( - {"dog": pd.Series(["two", "three"], dtype="category")} - ), + pd.DataFrame({"dog": pd.Series(["two", "three"], dtype="category")}), ], [ pd.DataFrame([]), pd.DataFrame([], index=[100]), - pd.DataFrame( - {"bird": 
pd.Series(["two", "three"], dtype="category")} - ), + pd.DataFrame({"bird": pd.Series(["two", "three"], dtype="category")}), ], ], ) @@ -1306,9 +1237,7 @@ def test_concat_join_empty_dataframes( @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [1]) -def test_concat_join_empty_dataframes_axis_1( - df, other, ignore_index, axis, join, sort -): +def test_concat_join_empty_dataframes_axis_1(df, other, ignore_index, axis, join, sort): # no duplicate columns other_pd = [df] + other gdf = cudf.from_pandas(df) @@ -1340,9 +1269,7 @@ def test_concat_join_empty_dataframes_axis_1( expected.fillna(-1), actual.fillna(-1), check_dtype=False, - check_index_type=False - if len(expected) == 0 or actual.empty - else True, + check_index_type=False if len(expected) == 0 or actual.empty else True, check_column_type=False, ) else: @@ -1353,9 +1280,7 @@ def test_concat_join_empty_dataframes_axis_1( check_index_type=False, check_column_type=False, ) - assert_eq( - expected, actual, check_index_type=False, check_column_type=False - ) + assert_eq(expected, actual, check_index_type=False, check_column_type=False) def test_concat_preserve_order(): @@ -1588,9 +1513,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ), ), ( - cudf.Series( - [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) - ), + cudf.Series([Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2)), cudf.Series([33, 984], dtype="uint32"), cudf.Series([593, -702], dtype="int32"), cudf.Series( @@ -1703,9 +1626,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): dtype=Decimal64Dtype(5, 2), ), cudf.Series( - np.arange( - "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" - ), + np.arange("2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]"), dtype="datetime64[s]", ), cudf.Series( @@ -1735,9 +1656,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): [Decimal("753.0"), Decimal("94.22")], dtype=Decimal64Dtype(5, 2), ), - cudf.Series( - [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] - ), + cudf.Series([np.timedelta64(940252, "s"), np.timedelta64(758385, "s")]), cudf.Series( ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], index=[0, 1, 0, 1], @@ -1817,9 +1736,7 @@ def test_concat_list_column(frame1, frame2, expected): def test_concat_categorical_ordering(): # https://github.com/rapidsai/cudf/issues/11486 - sr = pd.Series( - ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category" - ) + sr = pd.Series(["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category") sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) df = pd.DataFrame({"a": sr}) @@ -1858,13 +1775,9 @@ def singleton_concat_obj(request, singleton_concat_index): @pytest.mark.parametrize("axis", [0, 1, "columns", "index"]) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [False, True]) -def test_concat_singleton_sorting( - axis, sort, ignore_index, singleton_concat_obj -): +def test_concat_singleton_sorting(axis, sort, ignore_index, singleton_concat_obj): gobj = cudf.from_pandas(singleton_concat_obj) - gconcat = cudf.concat( - [gobj], axis=axis, sort=sort, ignore_index=ignore_index - ) + gconcat = cudf.concat([gobj], axis=axis, sort=sort, ignore_index=ignore_index) pconcat = pd.concat( [singleton_concat_obj], axis=axis, sort=sort, ignore_index=ignore_index ) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py 
index 15dfa111860..7b4caf41559 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime @@ -27,9 +27,7 @@ def cudf_num_series(start, stop, step=1): def get_categorical_series(): return Series( - pd.Categorical( - ["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"] - ) + pd.Categorical(["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"]) ) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index e737a73e86b..cb99f576a79 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -113,9 +113,7 @@ def test_series_setitem_partial_slice_cow_on(): assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] - assert ( - new_slice._column.base_data.owner == actual._column.base_data.owner - ) + assert new_slice._column.base_data.owner == actual._column.base_data.owner new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) @@ -400,29 +398,21 @@ def test_series_cat_copy(copy_on_write): def test_dataframe_cow_slice_setitem(): with cudf.option_context("copy_on_write", True): - df = cudf.DataFrame( - {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} - ) + df = cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}) slice_df = df[1:4] assert_eq( slice_df, - cudf.DataFrame( - {"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3] - ), + cudf.DataFrame({"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3]), ) slice_df["a"][2] = 1111 assert_eq( slice_df, - cudf.DataFrame( - {"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3] - ), + cudf.DataFrame({"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3]), ) assert_eq( df, - cudf.DataFrame( - {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} - ), + cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}), ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2d728fb94ba..70ff69f5a61 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -71,9 +71,7 @@ def make_datetime_dataframe(include_non_standard=False): def make_numpy_mixed_dataframe(): df = pd.DataFrame() df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Date"] = np.array( - ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] - ) + df["Date"] = np.array(["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"]) df["Float"] = np.array([9.001, 8.343, 6, 2.781]) df["Integer2"] = np.array([2345, 106, 2088, 789277]) df["Category"] = np.array(["M", "F", "F", "F"]) @@ -551,9 +549,7 @@ def test_csv_reader_NaN_values(): custom_na_values = ["NV_NAN", "NotANumber"] # test default NA values. 
empty cells should also yield NaNs - gdf = read_csv( - StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes - ) + gdf = read_csv(StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes) pdf = pd.read_csv( StringIO(default_na_cells + empty_cells), names=names, dtype=np.float32 ) @@ -631,9 +627,7 @@ def test_csv_reader_thousands(tmpdir): uint32_ref = [1234567, 12345] uint64_ref = [1234567890, 123456789] - df = read_csv( - str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" - ) + df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'") np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) @@ -659,9 +653,7 @@ def test_csv_reader_buffer_strings(): assert df["text"][2] == "c" assert df["text"][3] == "d" - df2 = read_csv( - BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1 - ) + df2 = read_csv(BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1) assert len(df2.columns) == 2 assert df2["text"].dtype == np.dtype("object") assert df2["int"].dtype == np.dtype("int64") @@ -684,18 +676,14 @@ def test_csv_reader_buffer_strings(): ("", None, None), ], ) -def test_csv_reader_compression( - tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe -): +def test_csv_reader_compression(tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_compression" + ext) df = pd_mixed_dataframe df.to_csv(fname, index=False, header=False, compression=out_comp) gdf = read_csv(fname, names=list(df.columns.values), compression=in_comp) - pdf = pd.read_csv( - fname, names=list(df.columns.values), compression=in_comp - ) + pdf = pd.read_csv(fname, names=list(df.columns.values), compression=in_comp) assert_eq(gdf, pdf) @@ -892,9 +880,7 @@ def test_csv_reader_nrows(tmpdir): assert df["int2"][read_rows - 1] == 2 * (read_rows - 1 + skip_rows) # with column name inference - df = read_csv( - str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows - ) + df = read_csv(str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows) assert df.shape == (read_rows, 2) assert str(skip_rows) in list(df)[0] assert str(2 * skip_rows) in list(df)[1] @@ -912,9 +898,7 @@ def test_csv_reader_nrows(tmpdir): assert df["int2"][rows - 1] == 2 * (rows - 1) # nrows + skiprows larger than the file - df = read_csv( - str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows - ) + df = read_csv(str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows) assert df.shape == (rows - read_rows, 2) # nrows equal to zero @@ -979,9 +963,7 @@ def test_csv_reader_skiprows_header(skip_rows, header_row): cu_df = read_csv( StringIO(buffer), dtype=dtypes, skiprows=skip_rows, header=header_row ) - pd_df = pd.read_csv( - StringIO(buffer), skiprows=skip_rows, header=header_row - ) + pd_df = pd.read_csv(StringIO(buffer), skiprows=skip_rows, header=header_row) assert cu_df.shape == pd_df.shape assert list(cu_df.columns.values) == list(pd_df.columns.values) @@ -1053,9 +1035,7 @@ def test_csv_reader_filenotfound(tmpdir): read_csv(str(dname)) -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "string_io", "url"] -) +@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "string_io", "url"]) def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src): expect = pd.read_csv(path_or_buf("filepath")) got = cudf.read_csv(path_or_buf(src)) @@ -1273,9 +1253,7 @@ def test_csv_reader_delim_whitespace(): with 
pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None) with pytest.warns(FutureWarning): - pd_df = pd.read_csv( - StringIO(buffer), delim_whitespace=True, header=None - ) + pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True, header=None) assert pd_df.shape == cu_df.shape # should raise an error if used with delimiter or sep @@ -1338,9 +1316,7 @@ def test_csv_reader_index_col(): # using a column index with names cu_df = read_csv(StringIO(buffer), header=None, index_col=0, names=names) - pd_df = pd.read_csv( - StringIO(buffer), header=None, index_col=0, names=names - ) + pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=0, names=names) assert_eq(cu_df.index, pd_df.index) # passing False to avoid using a column as index (no-op in cuDF) @@ -1352,9 +1328,7 @@ def test_csv_reader_index_col(): @pytest.mark.parametrize("index_name", [None, "custom name", 124]) @pytest.mark.parametrize("index_col", [None, 0, "a"]) def test_csv_reader_index_names(index_name, index_col): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"] - ) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"]) pdf.index.name = index_name buffer = pdf.to_csv() @@ -1363,9 +1337,7 @@ def test_csv_reader_index_names(index_name, index_col): assert_eq(actual, expected) -@pytest.mark.parametrize( - "names", [["a", "b", "c"], [416, 905, 647], range(3), None] -) +@pytest.mark.parametrize("names", [["a", "b", "c"], [416, 905, 647], range(3), None]) def test_csv_reader_column_names(names): buffer = "0,1,2\n3,4,5\n6,7,8" @@ -1403,9 +1375,7 @@ def test_csv_reader_aligned_byte_range(tmpdir): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv") nelem = 1000 - input_df = pd.DataFrame( - {"key": np.arange(0, nelem), "zeros": np.zeros(nelem)} - ) + input_df = pd.DataFrame({"key": np.arange(0, nelem), "zeros": np.zeros(nelem)}) input_df.to_csv(fname) df = cudf.read_csv(str(fname), byte_range=(0, 4096)) @@ -1427,9 +1397,7 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): # require explicit `hex` dtype to parse hexadecimals pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - np.testing.assert_array_equal( - pdf["hex_int"], gdf["hex_int"].to_numpy() - ) + np.testing.assert_array_equal(pdf["hex_int"], gdf["hex_int"].to_numpy()) else: # otherwise, dtype inference returns as object (string) pdf = pd.read_csv(StringIO(buffer), names=["hex_int"]) @@ -1474,9 +1442,7 @@ def test_csv_reader_pd_consistent_quotes(quoting): buffer = "\n".join(lines) - gd_df = read_csv( - StringIO(buffer), names=names, dtype=dtypes, quoting=quoting - ) + gd_df = read_csv(StringIO(buffer), names=names, dtype=dtypes, quoting=quoting) pd_df = pd.read_csv(StringIO(buffer), names=names, quoting=quoting) assert_eq(pd_df, gd_df) @@ -1690,9 +1656,7 @@ def test_csv_writer_terminator_sep(lineterminator, sep, cudf_mixed_dataframe): assert_eq(df, got) -@pytest.mark.parametrize( - "lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")] -) +@pytest.mark.parametrize("lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")]) def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): df = cudf_mixed_dataframe @@ -1718,12 +1682,8 @@ def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): None, ], ) -@pytest.mark.parametrize( - "header", [True, False, np.bool_(True), np.bool_(False)] -) -@pytest.mark.parametrize( - 
"index", [True, False, np.bool_(True), np.bool_(False)] -) +@pytest.mark.parametrize("header", [True, False, np.bool_(True), np.bool_(False)]) +@pytest.mark.parametrize("index", [True, False, np.bool_(True), np.bool_(False)]) def test_csv_writer_column_and_header_options( columns, header, index, pd_mixed_dataframe ): @@ -1793,12 +1753,8 @@ def test_csv_writer_chunksize(chunksize, dtype): "df", [ cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame( - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} - ), - cudf.DataFrame( - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} - ), + cudf.DataFrame({"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}), + cudf.DataFrame({"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}), ], ) def test_to_csv_empty_filename(df): @@ -1814,12 +1770,8 @@ def test_to_csv_empty_filename(df): "df", [ cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame( - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} - ), - cudf.DataFrame( - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} - ), + cudf.DataFrame({"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}), + cudf.DataFrame({"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}), ], ) def test_to_csv_StringIO(df): @@ -1933,9 +1885,7 @@ def test_csv_write_empty_dataframe(df, index): pd.DataFrame(columns=[""]), ], ) -@pytest.mark.parametrize( - "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"] -) +@pytest.mark.parametrize("na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"]) def test_csv_write_dataframe_na_rep(df, na_rep): gdf = cudf.from_pandas(df) @@ -1980,9 +1930,7 @@ def test_csv_reader_nullable_dtypes(dtype): assert_eq(expected, actual.to_pandas(nullable=True)) -@pytest.mark.parametrize( - "dtype", sorted(list(cudf.utils.dtypes.TIMEDELTA_TYPES)) -) +@pytest.mark.parametrize("dtype", sorted(list(cudf.utils.dtypes.TIMEDELTA_TYPES))) def test_csv_reader_timedetla_dtypes(dtype): buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" @@ -1992,9 +1940,7 @@ def test_csv_reader_timedetla_dtypes(dtype): assert_eq(expected, actual) -@pytest.mark.parametrize( - "dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES)) -) +@pytest.mark.parametrize("dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES))) def test_csv_reader_datetime_dtypes(dtype): buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" @@ -2016,9 +1962,7 @@ def test_csv_reader_datetime_dtypes(dtype): cudf.DataFrame( { "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), - "b": cudf.Series( - [None, "c", None, "b", "a"], dtype="category" - ), + "b": cudf.Series([None, "c", None, "b", "a"], dtype="category"), } ), cudf.DataFrame( @@ -2026,9 +1970,7 @@ def test_csv_reader_datetime_dtypes(dtype): "b": cudf.Series( [1.1, 2, 3, 1.1, 2], dtype="category", - index=cudf.CategoricalIndex( - ["abc", "def", "ghi", "jkl", "xyz"] - ), + index=cudf.CategoricalIndex(["abc", "def", "ghi", "jkl", "xyz"]), ) } ), @@ -2089,12 +2031,8 @@ def test_na_filter_empty_fields(): gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False) assert_eq(pdf, gdf) - pdf = pd.read_csv( - StringIO(buffer), keep_default_na=False, na_values=test_na - ) - gdf = cudf.read_csv( - StringIO(buffer), keep_default_na=False, na_values=test_na - ) + pdf = pd.read_csv(StringIO(buffer), keep_default_na=False, na_values=test_na) + gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False, na_values=test_na) assert_eq(pdf, gdf) @@ -2147,9 +2085,7 @@ def test_empty_df_no_index(): 
assert_eq(actual, result) -def test_default_integer_bitwidth( - cudf_mixed_dataframe, default_integer_bitwidth -): +def test_default_integer_bitwidth(cudf_mixed_dataframe, default_integer_bitwidth): # Test that integer columns in csv are _inferred_ as user specified # bitwidth buf = BytesIO() @@ -2157,9 +2093,7 @@ def test_default_integer_bitwidth( buf.seek(0) read = cudf.read_csv(buf) assert read["Integer"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth//8}" - ) + assert read["Integer2"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") def test_default_integer_bitwidth_partial( @@ -2172,9 +2106,7 @@ def test_default_integer_bitwidth_partial( buf.seek(0) read = cudf.read_csv(buf, dtype={"Integer": "int64"}) assert read["Integer"].dtype == np.dtype("i8") - assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth//8}" - ) + assert read["Integer2"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index 7fdf9754534..c5d032a98b1 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. """ Test method that apply GPU kernel to a frame. @@ -141,10 +141,7 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): expect_out1 = extra2 * in1 - extra1 * in2 + in3 expect_out2 = np.hstack( - [ - tpb * np.arange(e - s) - for s, e in zip(chunks, chunks[1:] + [len(df)]) - ] + [tpb * np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)])] ) outdf = df.apply_chunks( diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 213c6c2c1f9..550cffe7a9e 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -79,8 +79,7 @@ def test_cuda_array_interface_interop_out_masked(dtype, module): expectation = does_not_raise() if module == "cupy": pytest.skip( - "cupy doesn't support version 1 of " - "`__cuda_array_interface__` yet" + "cupy doesn't support version 1 of " "`__cuda_array_interface__` yet" ) module_constructor = cupy.asarray @@ -129,17 +128,13 @@ def test_cuda_array_interface_as_column(dtype, nulls, mask_type): sr = sr.astype(dtype) - obj = types.SimpleNamespace( - __cuda_array_interface__=sr.__cuda_array_interface__ - ) + obj = types.SimpleNamespace(__cuda_array_interface__=sr.__cuda_array_interface__) if mask_type == "bools": if nulls == "some": obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask) elif nulls == "all": - obj.__cuda_array_interface__["mask"] = numba.cuda.to_device( - [False] * 10 - ) + obj.__cuda_array_interface__["mask"] = numba.cuda.to_device([False] * 10) expect = sr got = cudf.Series(obj) diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py index 24c1eaa8f02..84e163d400a 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -18,9 +18,7 @@ @pytest.mark.parametrize("bins", [1, 2, 3]) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [True] -) # if ordered is False we need labels +@pytest.mark.parametrize("ordered", [True]) # if ordered is False 
we need labels @pytest.mark.parametrize("precision", [1, 2, 3]) def test_cut_basic(x, bins, right, include_lowest, ordered, precision): # will test optional labels, retbins and duplicates separately @@ -57,9 +55,7 @@ def test_cut_basic(x, bins, right, include_lowest, ordered, precision): @pytest.mark.parametrize( "labels", [["bad", "medium", "good"], ["A", "B", "C"], [1, 2, 3], False] ) -def test_cut_labels( - x, bins, right, include_lowest, ordered, precision, labels -): +def test_cut_labels(x, bins, right, include_lowest, ordered, precision, labels): pcat = pd.cut( x=x, bins=bins, @@ -87,9 +83,7 @@ def test_cut_labels( @pytest.mark.parametrize("bins", [3]) # labels must be the same len as bins @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [False] -) # labels must be unique if ordered=True +@pytest.mark.parametrize("ordered", [False]) # labels must be unique if ordered=True @pytest.mark.parametrize("precision", [1, 2, 3]) @pytest.mark.parametrize( "labels", [["bad", "good", "good"], ["B", "A", "B"], [1, 2, 2], False] diff --git a/python/cudf/cudf/tests/test_dask.py b/python/cudf/cudf/tests/test_dask.py index 3af21b4a7ff..6048ccb5327 100644 --- a/python/cudf/cudf/tests/test_dask.py +++ b/python/cudf/cudf/tests/test_dask.py @@ -1,12 +1,10 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import pytest import cudf -is_dataframe_like = pytest.importorskip( - "dask.dataframe.utils" -).is_dataframe_like +is_dataframe_like = pytest.importorskip("dask.dataframe.utils").is_dataframe_like is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e034a3f5e10..48c3fd911a6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -94,8 +94,7 @@ def _hide_concat_empty_dtype_warning(): # being caught and validated in other tests. 
warnings.filterwarnings( "ignore", - "The behavior of array concatenation with empty " - "entries is deprecated.", + "The behavior of array concatenation with empty " "entries is deprecated.", category=FutureWarning, ) yield @@ -273,12 +272,8 @@ def test_init_unaligned_with_index(): def test_init_series_list_columns_unsort(): - pseries = [ - pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) - ] - gseries = [ - cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) - ] + pseries = [pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)] + gseries = [cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)] pdf = pd.DataFrame(pseries) gdf = cudf.DataFrame(gseries) assert_eq(pdf, gdf) @@ -393,17 +388,11 @@ def test_dataframe_truncate_axis_1(): def test_dataframe_truncate_datetimeindex(): - dates = cudf.date_range( - "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" - ) + dates = cudf.date_range("2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s") df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) pdf = df.to_pandas() - expected = pdf.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ) - actual = df.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ) + expected = pdf.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27") + actual = df.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27") assert_eq(actual, expected) @@ -489,9 +478,7 @@ def test_dataframe_basic(): name="custom_name", ), ), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), + pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), ], ) @pytest.mark.parametrize( @@ -520,9 +507,7 @@ def test_dataframe_drop_columns(pdf, columns, inplace): {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, index=pd.Index(list(range(10)), name="custom_name"), ), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), + pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), ], ) @pytest.mark.parametrize( @@ -556,9 +541,7 @@ def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): "pdf", [ pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), + pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), pd.DataFrame( { "a": range(10), @@ -644,9 +627,7 @@ def test_dataframe_drop_multiindex(pdf, index, level, inplace): "pdf", [ pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), + pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), ], ) @pytest.mark.parametrize( @@ -738,9 +719,7 @@ def test_dataframe_swaplevel_axis_0(): def test_dataframe_swaplevel_TypeError(): - cdf = cudf.DataFrame( - {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] - ) + cdf = cudf.DataFrame({"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"]) with pytest.raises(TypeError): cdf.swaplevel() @@ -771,9 +750,7 @@ def test_dataframe_swaplevel_axis_1(): def test_dataframe_drop_raises(): - df = cudf.DataFrame( - {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] - ) + df = cudf.DataFrame({"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"]) pdf = df.to_pandas() assert_exceptions_equal( lfunc=pdf.drop, @@ -886,9 +863,7 @@ def test_dataframe_index_rename(axis): def test_dataframe_MI_rename(): - gdf = 
cudf.DataFrame( - {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} - ) + gdf = cudf.DataFrame({"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}) gdg = gdf.groupby(["a", "b"]).count() pdg = gdg.to_pandas() @@ -921,9 +896,7 @@ def test_dataframe_column_rename(axis): def test_dataframe_pop(): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} - ) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]}) gdf = cudf.DataFrame.from_pandas(pdf) # Test non-existing column error @@ -999,9 +972,7 @@ def test_index_astype(nelem): def test_dataframe_to_string_with_skipped_rows(): # Test skipped rows - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) with pd.option_context("display.max_rows", 5): got = df.to_string() @@ -1050,9 +1021,7 @@ def test_dataframe_to_string_with_skipped_rows_and_columns(): def test_dataframe_to_string_with_masked_data(): # Test masked data - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) data = np.arange(6) mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) @@ -1167,9 +1136,7 @@ def test_dataframe_copy_shallow(): def test_dataframe_dtypes(): - dtypes = pd.Series( - [np.int32, np.float32, np.float64], index=["c", "a", "b"] - ) + dtypes = pd.Series([np.int32, np.float32, np.float64], index=["c", "a", "b"]) df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) assert df.dtypes.equals(dtypes) @@ -1320,14 +1287,10 @@ def test_dataframe_setitem_from_masked_object(): test1_nan = cudf.Series(ary, nan_as_null=False) assert test1_nan.null_count == 0 - test2_null = cudf.DataFrame.from_pandas( - pd.DataFrame({"a": ary}), nan_as_null=True - ) + test2_null = cudf.DataFrame.from_pandas(pd.DataFrame({"a": ary}), nan_as_null=True) assert test2_null["a"].nullable assert test2_null["a"].null_count == 20 - test2_nan = cudf.DataFrame.from_pandas( - pd.DataFrame({"a": ary}), nan_as_null=False - ) + test2_nan = cudf.DataFrame.from_pandas(pd.DataFrame({"a": ary}), nan_as_null=False) assert test2_nan["a"].null_count == 0 gpu_ary = cupy.asarray(ary) @@ -1967,9 +1930,7 @@ def test_from_arrow(nelem, data_type): "b": np.random.randint(0, 1000, nelem).astype(data_type), } ) - padf = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) + padf = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None) gdf = cudf.DataFrame.from_arrow(padf) assert isinstance(gdf, cudf.DataFrame) @@ -1995,9 +1956,7 @@ def test_to_arrow(nelem, data_type): ) gdf = cudf.DataFrame.from_pandas(df) - pa_df = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) + pa_df = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None) pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) @@ -2056,9 +2015,7 @@ def test_to_arrow_categorical(): df["a"] = pd.Series(["a", "b", "c"], dtype="category") gdf = cudf.DataFrame.from_pandas(df) - pa_df = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) + pa_df = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None) pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) assert isinstance(pa_gdf, pa.Table) @@ -2096,9 +2053,7 @@ def test_to_arrow_missing_categorical(): def 
test_from_scalar_typing(data_type): if data_type == "datetime64[ms]": scalar = ( - np.dtype("int64") - .type(np.random.randint(0, 5)) - .astype("datetime64[ms]") + np.dtype("int64").type(np.random.randint(0, 5)).astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") @@ -2207,9 +2162,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): dtype=dtype, ) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) if len(idx): data[idx] = null_rep elif nulls == "all": @@ -2351,7 +2304,7 @@ def test_dataframe_reductions(data, axis, func, skipna): for kwargs in all_kwargs: if expected_exception is not None: with pytest.raises(expected_exception): - getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + (getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs),) else: expect = getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs) with expect_warning_if( @@ -2681,9 +2634,7 @@ def test_iteritems(gdf): def test_quantile(q, numeric_only): ts = pd.date_range("2018-08-24", periods=5, freq="D") td = pd.to_timedelta(np.arange(5), unit="h") - pdf = pd.DataFrame( - {"date": ts, "delta": td, "val": np.random.randn(len(ts))} - ) + pdf = pd.DataFrame({"date": ts, "delta": td, "val": np.random.randn(len(ts))}) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) @@ -2705,9 +2656,7 @@ def test_quantile(q, numeric_only): ) def test_decimal_quantile(q, interpolation, decimal_type): data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] - gdf = cudf.DataFrame( - {"id": np.random.randint(0, 10, size=len(data)), "val": data} - ) + gdf = cudf.DataFrame({"id": np.random.randint(0, 10, size=len(data)), "val": data}) gdf["id"] = gdf["id"].astype("float64") gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) pdf = gdf.to_pandas() @@ -2789,8 +2738,7 @@ def test_cuda_array_interface(dtype): @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): np_list_data = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + np.random.randint(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array = pa.chunked_array(np_list_data) @@ -2806,13 +2754,10 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): assert_eq(expect, got) np_list_data2 = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + np.random.randint(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array2 = pa.chunked_array(np_list_data2) - pa_table = pa.Table.from_arrays( - [pa_chunk_array, pa_chunk_array2], names=["a", "b"] - ) + pa_table = pa.Table.from_arrays([pa_chunk_array, pa_chunk_array2], names=["a", "b"]) expect = pa_table.to_pandas() if cudf.api.types.is_datetime64_dtype( @@ -2903,9 +2848,7 @@ def test_dataframe_boolmask(mask_shape): [True, False, True], pytest.param( cudf.Series([True, False, True]), - marks=pytest_xfail( - reason="Pandas can't index a multiindex with a Series" - ), + marks=pytest_xfail(reason="Pandas can't index a multiindex with a Series"), ), ], ) @@ -3364,9 +3307,7 @@ def test_dataframe_reindex(copy, reindex_data, args, gd_kwargs): ), ], ) -def test_dataframe_reindex_fill_value( - reindex_data_numeric, args, kwargs, fill_value -): +def test_dataframe_reindex_fill_value(reindex_data_numeric, args, kwargs, fill_value): pdf, 
gdf = reindex_data_numeric.to_pandas(), reindex_data_numeric kwargs["fill_value"] = fill_value assert_eq(pdf.reindex(*args, **kwargs), gdf.reindex(*args, **kwargs)) @@ -3395,9 +3336,7 @@ def test_series_categorical_reindex(copy): gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) pdf = gdf.to_pandas() assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) - assert_eq( - pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy) - ) + assert_eq(pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy)) assert_eq( pdf["a"].reindex(index=index, copy=True), gdf["a"].reindex(index=index, copy=copy), @@ -3410,9 +3349,7 @@ def test_series_float_reindex(copy): gdf = cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) pdf = gdf.to_pandas() assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) - assert_eq( - pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy) - ) + assert_eq(pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy)) assert_eq( pdf["c"].reindex(index=index, copy=True), gdf["c"].reindex(index=index, copy=copy), @@ -3425,9 +3362,7 @@ def test_series_string_reindex(copy): gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) pdf = gdf.to_pandas() assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) - assert_eq( - pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy) - ) + assert_eq(pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy)) assert_eq( pdf["d"].reindex(index=index, copy=True), gdf["d"].reindex(index=index, copy=copy), @@ -3454,9 +3389,7 @@ def test_reindex_multiindex_col_to_multiindex(names, klass): @pytest.mark.parametrize("names", [None, ["a", "b"]]) @pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) def test_reindex_tuple_col_to_multiindex(names, klass): - idx = pd.Index( - [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False - ) + idx = pd.Index([("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False) df = pd.DataFrame([[1, 2]], columns=idx) gdf = cudf.from_pandas(df) midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names) @@ -3725,9 +3658,7 @@ def test_select_dtype(): ), ) - gdf = cudf.DataFrame( - {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} - ) + gdf = cudf.DataFrame({"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]}) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes(include=["object", "int", "category"]), @@ -3752,9 +3683,7 @@ def test_select_dtype(): pdf.select_dtypes(include=["object"]), gdf.select_dtypes(include=["object"]), ) - assert_eq( - pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"]) - ) + assert_eq(pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"])) assert_eq( pdf.select_dtypes(exclude=["float"]), gdf.select_dtypes(exclude=["float"]), @@ -3786,9 +3715,7 @@ def test_select_dtype(): gdf.select_dtypes(include=["int"], exclude=["object"]), ) - gdf = cudf.DataFrame( - {"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]} - ) + gdf = cudf.DataFrame({"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]}) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes("int64"), @@ -3913,9 +3840,7 @@ def test_dataframe_describe_percentiles(): def test_get_numeric_data(): - pdf = pd.DataFrame( - {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} - ) + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]}) gdf = cudf.from_pandas(pdf) assert_eq(pdf._get_numeric_data(), 
gdf._get_numeric_data()) @@ -4043,9 +3968,7 @@ def test_ndim(): [1, 4, 3, -6], index=["floats", "ints", "floats_with_nan", "floats_same"], ), - cudf.Series( - [-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"] - ), + cudf.Series([-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"]), {"floats": -1, "ints": 15, "floats_will_nan": 2}, ], ) @@ -4284,16 +4207,12 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series( - column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") - ) + gds = cudf.Series(column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32")) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series( - column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") - ) + gds = cudf.Series(column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str")) assert_eq(pds, gds) @@ -4327,10 +4246,7 @@ def test_no_cols_head(index): @pytest.mark.parametrize("dtype", ALL_TYPES) @pytest.mark.parametrize( "np_dtype,pd_dtype", - [ - tuple(item) - for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items() - ], + [tuple(item) for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items()], ) def test_series_astype_pandas_nullable(dtype, np_dtype, pd_dtype): source = cudf.Series([0, 1, None], dtype=dtype) @@ -4464,9 +4380,7 @@ def test_series_astype_to_categorical_ordered(ordered): psr = pd.Series([1, 2, 3, 1], dtype="category") gsr = cudf.from_pandas(psr) - ordered_dtype_pd = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=ordered - ) + ordered_dtype_pd = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), @@ -4477,9 +4391,7 @@ def test_series_astype_to_categorical_ordered(ordered): @pytest.mark.parametrize("ordered", [True, False]) def test_series_astype_cat_ordered_to_unordered(ordered): pd_dtype = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) - pd_to_dtype = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=not ordered - ) + pd_to_dtype = pd.CategoricalDtype(categories=[1, 2, 3], ordered=not ordered) gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) @@ -4569,9 +4481,7 @@ def test_series_astype_null_cases(): assert_eq( pd.Series(data, dtype="datetime64[ns]").astype("category"), - cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( - "category" - ), + cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype("category"), ) @@ -4665,9 +4575,7 @@ def test_dataframe_columns_returns_rangeindex_single_col(): @pytest.mark.parametrize("idx_data", [[], [1, 2]]) @pytest.mark.parametrize("data", [None, [], {}]) def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data): - result = cudf.DataFrame( - data, columns=cudf.Index(idx_data, dtype=dtype) - ).columns + result = cudf.DataFrame(data, columns=cudf.Index(idx_data, dtype=dtype)).columns expected = pd.Index(idx_data, dtype=dtype) assert_eq(result, expected) @@ -4752,15 +4660,11 @@ def test_series_values_property(data): {"A": np.float32(np.arange(3)), "B": np.float64(np.arange(3))}, pytest.param( {"A": [1, None, 3], "B": [1, 2, None]}, - marks=pytest_xfail( - reason="Nulls not supported by values accessor" - ), + marks=pytest_xfail(reason="Nulls not supported by values accessor"), ), pytest.param( {"A": [None, None, None], "B": [None, None, None]}, - 
marks=pytest_xfail( - reason="Nulls not supported by values accessor" - ), + marks=pytest_xfail(reason="Nulls not supported by values accessor"), ), {"A": [], "B": []}, pytest.param( @@ -4891,8 +4795,7 @@ def test_isin_dataframe(data, values): except TypeError as e: # Can't do isin with different categories if str(e) == ( - "Categoricals can only be compared if 'categories' " - "are the same." + "Categoricals can only be compared if 'categories' " "are the same." ): return @@ -5086,9 +4989,7 @@ def test_df_astype_to_categorical_ordered(ordered): pdf["bar"] = psr gdf = cudf.DataFrame.from_pandas(pdf) - ordered_dtype_pd = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=ordered - ) + ordered_dtype_pd = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( @@ -5115,9 +5016,7 @@ def test_empty_df_astype(dtype): @pytest.mark.parametrize( "errors", [ - pytest.param( - "raise", marks=pytest_xfail(reason="should raise error here") - ), + pytest.param("raise", marks=pytest_xfail(reason="should raise error here")), pytest.param("other", marks=pytest_xfail(raises=ValueError)), "ignore", ], @@ -5170,9 +5069,7 @@ def test_df_constructor_dtype(dtype): { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], - "c": cudf.Series( - [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False - ), + "c": cudf.Series([np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False), "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), @@ -5188,9 +5085,7 @@ def test_df_constructor_dtype(dtype): ), ], ) -@pytest.mark.parametrize( - "op", ["max", "min", "sum", "product", "mean", "var", "std"] -) +@pytest.mark.parametrize("op", ["max", "min", "sum", "product", "mean", "var", "std"]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_rowwise_ops(data, op, skipna, numeric_only): @@ -5202,11 +5097,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): kwargs["ddof"] = 0 if not numeric_only and not all( - ( - (pdf[column].count() == 0) - if skipna - else (pdf[column].notna().count() == 0) - ) + ((pdf[column].count() == 0) if skipna else (pdf[column].notna().count() == 0)) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) or cudf.api.types.is_bool_dtype(pdf[column].dtype) for column in pdf @@ -5227,9 +5118,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): ) -@pytest.mark.parametrize( - "op", ["max", "min", "sum", "product", "mean", "var", "std"] -) +@pytest.mark.parametrize("op", ["max", "min", "sum", "product", "mean", "var", "std"]) def test_rowwise_ops_nullable_dtypes_all_null(op): gdf = cudf.DataFrame( { @@ -5435,23 +5324,17 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes ): with pytest.raises(TypeError): - got = getattr(gdf, op)( - axis=1, skipna=skipna, numeric_only=numeric_only - ) + got = getattr(gdf, op)(axis=1, skipna=skipna, numeric_only=numeric_only) with pytest.raises(TypeError): expected = getattr(pdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only ) else: - got = getattr(gdf, op)( - axis=1, skipna=skipna, numeric_only=numeric_only - ) - expected = getattr(pdf, op)( - axis=1, skipna=skipna, numeric_only=numeric_only - ) - if got.dtype == cudf.dtype( - "datetime64[us]" - ) and expected.dtype == np.dtype("datetime64[ns]"): + got = getattr(gdf, op)(axis=1, skipna=skipna, 
numeric_only=numeric_only) + expected = getattr(pdf, op)(axis=1, skipna=skipna, numeric_only=numeric_only) + if got.dtype == cudf.dtype("datetime64[us]") and expected.dtype == np.dtype( + "datetime64[ns]" + ): # Workaround for a PANDAS-BUG: # https://github.com/pandas-dev/pandas/issues/52524 assert_eq(got.astype("datetime64[ns]"), expected) @@ -5468,9 +5351,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): ["2020-08-01 09:00:00", "1920-05-01 10:30:00"], dtype=""} - ], + "val": [{"name": "var1", "val": None, "type": "optional"}], "type": "list", }, {}, diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 161b245953b..ba0a47f793d 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -126,9 +126,7 @@ def test_drop_duplicates(): expected = pdf.drop_duplicates("E", keep="last") assert_eq(result, expected) - pdf = pd.DataFrame( - {"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]} - ) + pdf = pd.DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) @@ -157,9 +155,7 @@ def test_drop_duplicates(): @pytest.mark.skip(reason="cudf does not support duplicate column names yet") def test_drop_duplicates_with_duplicate_column_names(): - df = pd.DataFrame( - [[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"] - ) + df = pd.DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) df = cudf.DataFrame.from_pandas(df) result0 = df.drop_duplicates() @@ -343,12 +339,8 @@ def test_dataframe_drop_duplicates_method(): assert_eq(gdf.drop_duplicates("n1"), pdf.drop_duplicates("n1")) assert_eq(gdf.drop_duplicates("n2"), pdf.drop_duplicates("n2")) assert_eq(gdf.drop_duplicates("s1"), pdf.drop_duplicates("s1")) - assert_eq( - gdf.drop_duplicates(["n1", "n2"]), pdf.drop_duplicates(["n1", "n2"]) - ) - assert_eq( - gdf.drop_duplicates(["n1", "s1"]), pdf.drop_duplicates(["n1", "s1"]) - ) + assert_eq(gdf.drop_duplicates(["n1", "n2"]), pdf.drop_duplicates(["n1", "n2"])) + assert_eq(gdf.drop_duplicates(["n1", "s1"]), pdf.drop_duplicates(["n1", "s1"])) # Test drop error assert_exceptions_equal( diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 12a325fa4e8..9a4a4489750 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import os from string import ascii_letters @@ -19,10 +19,7 @@ def pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - } + {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types} ) # Delete the name of the column index, and rename the row index test_pdf.columns.name = None diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 06516b6b4ea..a9b319d88ad 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -111,9 +111,7 @@ def pdf(gdf): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_mean(nelem): got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - ) + expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() assert_groupby_results_equal(got_df, expect_df) @@ -121,30 +119,18 @@ def test_groupby_mean(nelem): def test_groupby_mean_3level(nelem): lvls = "z" bys = list("xyz") - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) - .mean() - ) + got_df = make_frame(DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys).mean() expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) - .mean() + make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys).mean() ) assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_mean_min(nelem): - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"]) - .agg(["mean", "min"]) - ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(["mean", "min"]) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"]) - .agg(["mean", "min"]) + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(["mean", "min"]) ) assert_groupby_results_equal(got_df, expect_df) @@ -199,17 +185,11 @@ def test_groupby_as_index_apply(pdf, gdf, as_index, engine): @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_multiindex(pdf, gdf, as_index): - pdf = pd.DataFrame( - {"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]} - ) + pdf = pd.DataFrame({"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]}) gdf = cudf.from_pandas(pdf) - gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( - {"c": "mean"} - ) - pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( - {"c": "mean"} - ) + gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg({"c": "mean"}) + pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg({"c": "mean"}) if as_index: assert_eq(pdf, gdf) @@ -382,9 +362,7 @@ def foo(key1, val1, com1, com2): got = got.to_pandas() expect = df.copy() - expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype( - np.float64 - ) + expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype(np.float64) expect["com2"] = np.zeros(nelem, dtype=np.int32) assert_groupby_results_equal(expect, got) @@ -422,9 +400,7 @@ def groupby_jit_data_large(groupby_jit_data_small): manifesting numerical issues such as overflow. 
""" max_tpb = 1024 - factor = ( - max_tpb + 1 - ) # bigger than a block but not always an exact multiple + factor = max_tpb + 1 # bigger than a block but not always an exact multiple df = cudf.concat([groupby_jit_data_small] * factor) return df @@ -500,18 +476,14 @@ def func(df): "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -def test_groupby_apply_jit_unary_reductions( - func, dtype, dataset, groupby_jit_datasets -): +def test_groupby_apply_jit_unary_reductions(func, dtype, dataset, groupby_jit_datasets): dataset = groupby_jit_datasets[dataset] groupby_apply_jit_reductions_test_inner(func, dataset, dtype) # test unary reductions for special values -def groupby_apply_jit_reductions_special_vals_inner( - func, data, dtype, special_val -): +def groupby_apply_jit_reductions_special_vals_inner(func, data, dtype, special_val): funcstr = textwrap.dedent( f""" def func(df): @@ -531,9 +503,7 @@ def func(df): # test unary index reductions for special values -def groupby_apply_jit_idx_reductions_special_vals_inner( - func, data, dtype, special_val -): +def groupby_apply_jit_idx_reductions_special_vals_inner(func, data, dtype, special_val): funcstr = textwrap.dedent( f""" def func(df): @@ -617,9 +587,7 @@ def func(group): pytest.param( "small", marks=[ - pytest.mark.filterwarnings( - "ignore:Degrees of Freedom <= 0 for slice" - ), + pytest.mark.filterwarnings("ignore:Degrees of Freedom <= 0 for slice"), pytest.mark.filterwarnings( "ignore:divide by zero encountered in divide" ), @@ -642,10 +610,7 @@ def func(group): if np.dtype(dtype).kind == "f": # Correlation of floating types is not yet supported: # https://github.com/rapidsai/cudf/issues/13839 - m = ( - f"Series.corr\\(Series\\) is not " - f"supported for \\({dtype}, {dtype}\\)" - ) + m = f"Series.corr\\(Series\\) is not " f"supported for \\({dtype}, {dtype}\\)" with pytest.raises(UDFError, match=m): run_groupby_apply_jit_test(dataset, func, keys) return @@ -658,9 +623,7 @@ def test_groupby_apply_jit_correlation_zero_variance(dtype): # pearson correlation is undefined when the variance of either # variable is zero. This test ensures that the jit implementation # returns the same result as pandas in this case. 
- data = DataFrame( - {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} - ) + data = DataFrame({"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]}) def func(group): return group["b"].corr(group["c"]) @@ -684,9 +647,7 @@ def func(group): @pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error( - groupby_jit_data_small, op -): +def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data_small, op): keys = ["key1"] def func(group): @@ -757,13 +718,9 @@ def f3(df, k, L, m): return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_jit_args_params() -) +@pytest.mark.parametrize("func,args", create_test_groupby_apply_jit_args_params()) def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): - run_groupby_apply_jit_test( - groupby_jit_data_small, func, ["key1", "key2"], *args - ) + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"], *args) def test_groupby_apply_jit_block_divergence(): @@ -920,9 +877,7 @@ def pdf_func(df): ) def test_groupby_2keys_agg(nelem, func): # gdf (Note: lack of multiIndex) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) - ) + expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) check_dtype = func not in _index_type_aggs @@ -1291,12 +1246,8 @@ def test_groupby_list_then_string(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [6, 7, 6, 7, 6] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( - {"b": ["min", "max"], "c": "max"} - ) - pdg = pdf.groupby("a", as_index=True).agg( - {"b": ["min", "max"], "c": "max"} - ) + gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"], "c": "max"}) + pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"], "c": "max"}) assert_groupby_results_equal(gdg, pdg) @@ -1306,12 +1257,8 @@ def test_groupby_different_unequal_length_column_aggregations(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( - {"b": "min", "c": ["max", "min"]} - ) - pdg = pdf.groupby("a", as_index=True).agg( - {"b": "min", "c": ["max", "min"]} - ) + gdg = gdf.groupby("a", as_index=True).agg({"b": "min", "c": ["max", "min"]}) + pdg = pdf.groupby("a", as_index=True).agg({"b": "min", "c": ["max", "min"]}) assert_groupby_results_equal(pdg, gdg) @@ -1483,9 +1430,7 @@ def test_groupby_nulls_in_index(): pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) + assert_groupby_results_equal(pdf.groupby("a").sum(), gdf.groupby("a").sum()) def test_groupby_all_nulls_index(): @@ -1496,17 +1441,13 @@ def test_groupby_all_nulls_index(): } ) pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) + assert_groupby_results_equal(pdf.groupby("a").sum(), gdf.groupby("a").sum()) gdf = cudf.DataFrame( {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} ) pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) + assert_groupby_results_equal(pdf.groupby("a").sum(), gdf.groupby("a").sum()) @pytest.mark.parametrize("sort", [True, False]) @@ -1520,9 +1461,7 @@ def test_groupby_sort(sort): check_like=not sort, ) - pdf = 
pd.DataFrame( - {"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]} - ) + pdf = pd.DataFrame({"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]}) gdf = cudf.from_pandas(pdf) assert_eq( @@ -1581,9 +1520,7 @@ def test_groupby_quantile(request, interpolation, q): request.applymarker( pytest.mark.xfail( condition=(q == 0.5 and interpolation == "nearest"), - reason=( - "Pandas NaN Rounding will fail nearest interpolation at 0.5" - ), + reason=("Pandas NaN Rounding will fail nearest interpolation at 0.5"), ) ) @@ -1684,9 +1621,7 @@ def test_groupby_cumcount(index): @pytest.mark.parametrize("nelem", [2, 3, 1000]) @pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "mean", "count"] -) +@pytest.mark.parametrize("agg", ["min", "max", "idxmin", "idxmax", "mean", "count"]) def test_groupby_datetime(nelem, as_index, agg): if agg == "mean" and as_index is True: return @@ -1712,9 +1647,7 @@ def test_groupby_datetime(nelem, as_index, agg): def test_groupby_dropna(): df = cudf.DataFrame({"a": [1, 1, None], "b": [1, 2, 3]}) - expect = cudf.DataFrame( - {"b": [3, 3]}, index=cudf.Series([1, None], name="a") - ) + expect = cudf.DataFrame({"b": [3, 3]}, index=cudf.Series([1, None], name="a")) got = df.groupby("a", dropna=False).sum() assert_groupby_results_equal(expect, got) @@ -1782,9 +1715,7 @@ def test_groupby_series_same_name_as_dataframe_column(): def test_group_by_series_and_column_name_in_by(): - gdf = cudf.DataFrame( - {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3] - ) + gdf = cudf.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3]) gsr0 = cudf.Series([0.0, 1.0, 2.0], name="a", index=[1, 2, 3]) gsr1 = cudf.Series([0.0, 1.0, 3.0], name="b", index=[3, 4, 5]) @@ -1820,9 +1751,7 @@ def test_grouping(grouper): ) gdf = cudf.from_pandas(pdf) - for pdf_group, gdf_group in zip( - pdf.groupby(grouper), gdf.groupby(grouper) - ): + for pdf_group, gdf_group in zip(pdf.groupby(grouper), gdf.groupby(grouper)): assert pdf_group[0] == gdf_group[0] assert_eq(pdf_group[1], gdf_group[1]) @@ -1944,9 +1873,7 @@ def test_groupby_agg_combinations(agg): def test_groupby_apply_noempty_group(): - pdf = pd.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} - ) + pdf = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) expect = ( @@ -2003,9 +1930,7 @@ def _groupby(self): ], ) def test_groupby_groups(by): - pdf = pd.DataFrame( - {"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]} - ) + pdf = pd.DataFrame({"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]}) gdf = cudf.from_pandas(pdf) pdg = pdf.groupby(by) @@ -2124,9 +2049,7 @@ def test_groupby_list_single_element(list_agg): ) -@pytest.mark.parametrize( - "agg", [list, [list, "count"], {"b": list, "c": "sum"}] -) +@pytest.mark.parametrize("agg", [list, [list, "count"], {"b": list, "c": "sum"}]) def test_groupby_list_strings(agg): pdf = pd.DataFrame( { @@ -2206,9 +2129,7 @@ def f3(x, k, L, m): return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_scalars_params() -) +@pytest.mark.parametrize("func,args", create_test_groupby_apply_return_scalars_params()) def test_groupby_apply_return_scalars(func, args): pdf = pd.DataFrame( { @@ -2268,9 +2189,7 @@ def f5(x, k, L, m): "func,args", create_test_groupby_apply_return_series_dataframe_params() ) def test_groupby_apply_return_series_dataframe(func, args): - pdf = 
pd.DataFrame( - {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} - ) + pdf = pd.DataFrame({"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]}) gdf = cudf.from_pandas(pdf) expected = pdf.groupby(["key"], group_keys=False).apply( @@ -2354,17 +2273,11 @@ def test_groupby_unique(by, data, dtype): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize( - "func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"] -) +@pytest.mark.parametrize("func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"]) def test_groupby_2keys_scan(nelem, func): pdf = make_frame(pd.DataFrame, nelem=nelem) expect_df = pdf.groupby(["x", "y"], sort=True).agg(func) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"], sort=True).agg(func) # pd.groupby.cumcount returns a series. if isinstance(expect_df, pd.Series): expect_df = expect_df.to_frame("val") @@ -2402,9 +2315,7 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): def test_groupby_rank_fails(): - gdf = cudf.DataFrame( - {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} - ) + gdf = cudf.DataFrame({"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]}) with pytest.raises(NotImplementedError): gdf.groupby(["x", "y"]).rank(method="min", axis=1) gdf = cudf.DataFrame( @@ -2417,9 +2328,7 @@ def test_groupby_rank_fails(): gdf.groupby(["a"]).rank(method="min", axis=1) -@pytest.mark.parametrize( - "with_nan", [False, True], ids=["just-NA", "also-NaN"] -) +@pytest.mark.parametrize("with_nan", [False, True], ids=["just-NA", "also-NaN"]) @pytest.mark.parametrize("dropna", [False, True], ids=["keepna", "dropna"]) @pytest.mark.parametrize( "duplicate_index", [False, True], ids=["rangeindex", "dupindex"] @@ -2467,14 +2376,10 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): gdf = cudf.from_pandas(pdf) n_shift = int(nelem * shift_perc) * direction - expected = pdf.groupby(["x", "y"]).shift( - periods=n_shift, fill_value=fill_value - ) + expected = pdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) - assert_groupby_results_equal( - expected[["val", "val2"]], got[["val", "val2"]] - ) + assert_groupby_results_equal(expected[["val", "val2"]], got[["val", "val2"]]) @pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @@ -2498,9 +2403,7 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): ), ], ) -def test_groupby_shift_row_mixed_numerics( - nelem, shift_perc, direction, fill_value -): +def test_groupby_shift_row_mixed_numerics(nelem, shift_perc, direction, fill_value): t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2583,9 +2486,7 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ] ], ) -def test_groupby_shift_row_mixed_fill( - nelem, shift_perc, direction, fill_value -): +def test_groupby_shift_row_mixed_fill(nelem, shift_perc, direction, fill_value): t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2616,9 +2517,7 @@ def test_groupby_shift_row_mixed_fill( if isinstance(single_fill, cudf.Scalar): single_fill = single_fill._host_value expected[col] = ( - pdf[col] - .groupby(pdf["0"]) - .shift(periods=n_shift, fill_value=single_fill) + pdf[col].groupby(pdf["0"]).shift(periods=n_shift, fill_value=single_fill) ) got = 
gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) @@ -2671,9 +2570,7 @@ def test_groupby_diff_row(nelem, shift_perc, direction): expected = pdf.groupby(["x", "y"]).diff(periods=n_shift) got = gdf.groupby(["x", "y"]).diff(periods=n_shift) - assert_groupby_results_equal( - expected[["val", "val2"]], got[["val", "val2"]] - ) + assert_groupby_results_equal(expected[["val", "val2"]], got[["val", "val2"]]) @pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @@ -2775,8 +2672,7 @@ def test_groupby_fillna_multi_value(nelem): # fill the dataframe with the first non-null item in the column fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] - for name in value_cols + name: pdf[name].loc[pdf[name].first_valid_index()] for name in value_cols } # cudf can't fillna with a pandas.Timedelta type fill_values["4"] = fill_values["4"].to_numpy() @@ -2820,8 +2716,7 @@ def test_groupby_fillna_multi_value_df(nelem): # fill the dataframe with the first non-null item in the column fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] - for name in value_cols + name: pdf[name].loc[pdf[name].first_valid_index()] for name in value_cols } # cudf can't fillna with a pandas.Timedelta type fill_values["4"] = fill_values["4"].to_numpy() @@ -2840,9 +2735,7 @@ def test_groupby_fillna_multi_value_df(nelem): "by", [pd.Series([1, 1, 2, 2, 3, 4]), lambda x: x % 2 == 0, pd.Grouper(level=0)], ) -@pytest.mark.parametrize( - "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] -) +@pytest.mark.parametrize("data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]]) @pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) @@ -2906,9 +2799,7 @@ def test_groupby_fillna_method(nelem, method): with pytest.warns(FutureWarning): got = gdf.groupby(key_col).fillna(method=method) - assert_groupby_results_equal( - expect[value_cols], got[value_cols], sort=False - ) + assert_groupby_results_equal(expect[value_cols], got[value_cols], sort=False) @pytest.mark.parametrize( @@ -3238,9 +3129,7 @@ def test_groupby_select_then_diff(): @pytest.mark.parametrize("by", ["a", ["a", "b"], pd.Series([1, 2, 1, 3])]) def test_groupby_transform_maintain_index(by): # test that we maintain the index after a groupby transform - gdf = cudf.DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0] - ) + gdf = cudf.DataFrame({"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0]) pdf = gdf.to_pandas() assert_groupby_results_equal( pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max") @@ -3287,9 +3176,7 @@ def test_groupby_pct_change(data, gkey, periods, fill_method): pdf = gdf.to_pandas() with expect_warning_if(fill_method not in (no_default, None)): - actual = gdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) + actual = gdf.groupby(gkey).pct_change(periods=periods, fill_method=fill_method) with expect_warning_if( ( fill_method not in (no_default, None) @@ -3387,9 +3274,7 @@ def test_groupby_ngroup(by, ascending, df_ngroup): PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="warning not present in older pandas versions", ) -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) +@pytest.mark.parametrize("groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]) def test_groupby_dtypes(groups): df = cudf.DataFrame( {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]} @@ -3415,9 +3300,7 @@ def 
test_groupby_by_index_names(index_names): ) -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) +@pytest.mark.parametrize("groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]) def test_group_by_pandas_compat(groups): with cudf.option_context("mode.pandas_compatible", True): df = cudf.DataFrame( @@ -3439,9 +3322,7 @@ def index(self, request): if request.param == "rangeindex": return cudf.RangeIndex(2, n + 2) elif request.param == "intindex": - return cudf.Index( - [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" - ) + return cudf.Index([2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32") elif request.param == "strindex": return cudf.Index(list(string.ascii_lowercase[:n])) elif request.param == "default": @@ -3511,9 +3392,7 @@ def test_not_implemented_arguments(self, df): @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) @pytest.mark.parametrize("replace", [False, True]) def test_fraction_rounding(self, df, by, frac, replace): - result = ( - df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") - ) + result = df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") assert_eq(self.expected(df, frac=frac), result.reset_index(drop=True)) @@ -3522,9 +3401,7 @@ class TestHeadTail: def n(self, request): return request.param - @pytest.fixture( - params=[False, True], ids=["no-preserve-order", "preserve-order"] - ) + @pytest.fixture(params=[False, True], ids=["no-preserve-order", "preserve-order"]) def preserve_order(self, request): return request.param @@ -3565,9 +3442,7 @@ def expected(self, df, n, take_head, preserve_order): slicefunc = operator.itemgetter(slice(None, n)) else: # Tail does group[-n:] except when n == 0 - slicefunc = operator.itemgetter( - slice(-n, None) if n else slice(0) - ) + slicefunc = operator.itemgetter(slice(-n, None) if n else slice(0)) values_to_sort = np.hstack( [df.values_host, np.arange(len(df)).reshape(-1, 1)] ) @@ -3580,9 +3455,7 @@ def expected(self, df, n, take_head, preserve_order): ) ) ) - return cudf.DataFrame( - {"a": expect_a, "b": expect_b}, index=index - ) + return cudf.DataFrame({"a": expect_a, "b": expect_b}, index=index) def test_head_tail(self, df, n, take_head, expected, preserve_order): if take_head: @@ -3609,9 +3482,7 @@ def test_head_tail_empty(): assert_eq(expected, got, check_column_type=False) -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) +@pytest.mark.parametrize("groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]) @pytest.mark.parametrize("sort", [True, False]) def test_group_by_pandas_sort_order(groups, sort): with cudf.option_context("mode.pandas_compatible", True): @@ -3652,9 +3523,7 @@ def test_group_by_empty_reduction(dtype, reduce_op): gg = gdf.groupby("a")["c"] pg = pdf.groupby("a")["c"] - assert_eq( - getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True - ) + assert_eq(getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py index e081119ff89..9d7b4599a89 100644 --- a/python/cudf/cudf/tests/test_hash_vocab.py +++ b/python/cudf/cudf/tests/test_hash_vocab.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import filecmp import os import warnings @@ -10,9 +10,7 @@ @pytest.fixture(scope="module") def datadir(datadir): - return os.path.join( - datadir, "subword_tokenizer_data", "bert_base_cased_sampled" - ) + return os.path.join(datadir, "subword_tokenizer_data", "bert_base_cased_sampled") def test_correct_bert_base_vocab_hash(datadir, tmpdir): diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index d420c95cfb4..d48bf4c5b98 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -15,9 +15,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): - types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set( - UNSIGNED_TYPES - ) + types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set(UNSIGNED_TYPES) typer = {"col_" + val: val for val in types} ncols = len(types) nrows = request.param @@ -59,9 +57,7 @@ def hdf_files(request, tmp_path_factory, pdf): fname_series = {} for column in pdf.columns: - fname_series[column] = ( - tmp_path_factory.mktemp("hdf") / "test_series.hdf" - ) + fname_series[column] = tmp_path_factory.mktemp("hdf") / "test_series.hdf" pdf[column].to_hdf( fname_series[column], key="hdf_series_tests", format=request.param ) @@ -83,9 +79,7 @@ def test_hdf_reader(hdf_files, columns): expect_df = pd.read_hdf(hdf_df_file, columns=columns) got_df = cudf.read_hdf(hdf_df_file, columns=columns) - assert_eq( - expect_df, got_df, check_categorical=False, check_index_type=False - ) + assert_eq(expect_df, got_df, check_categorical=False, check_index_type=False) for column in hdf_series.keys(): expect_series = pd.read_hdf(hdf_series[column]) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index f8de16f8609..61ece42e9fc 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import os from io import BytesIO @@ -57,9 +57,7 @@ def test_read_csv(tmpdir, pdf, hdfs, test_url): hdfs.upload(basedir + "/test_csv_reader.csv", buffer) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_csv_reader.csv".format( - host, port, basedir - ) + hd_fpath = "hdfs://{}:{}{}/test_csv_reader.csv".format(host, port, basedir) else: hd_fpath = f"hdfs://{basedir}/test_csv_reader.csv" @@ -76,9 +74,7 @@ def test_read_csv(tmpdir, pdf, hdfs, test_url): def test_write_csv(pdf, hdfs, test_url): gdf = cudf.from_pandas(pdf) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_csv_writer.csv".format( - host, port, basedir - ) + hd_fpath = "hdfs://{}:{}{}/test_csv_writer.csv".format(host, port, basedir) else: hd_fpath = f"hdfs://{basedir}/test_csv_writer.csv" @@ -136,9 +132,7 @@ def test_write_parquet(pdf, hdfs, test_url): assert_eq(pdf, got) -@pytest.mark.xfail( - reason="Writing string columns with parition_cols is incorrect" -) +@pytest.mark.xfail(reason="Writing string columns with parition_cols is incorrect") @pytest.mark.parametrize("test_url", [False, True]) def test_write_parquet_partitioned(tmpdir, pdf, hdfs, test_url): pdf.to_parquet( @@ -155,15 +149,11 @@ def test_write_parquet_partitioned(tmpdir, pdf, hdfs, test_url): hd_fpath = f"hdfs://{basedir}/test_parquet_partitioned.parquet" # Clear data written from previous runs hdfs.rm(f"{basedir}/test_parquet_partitioned.parquet", recursive=True) - gdf.to_parquet( - hd_fpath, index=False, partition_cols=["Integer", "Boolean"] - ) + gdf.to_parquet(hd_fpath, index=False, partition_cols=["Integer", "Boolean"]) assert hdfs.exists(f"{basedir}/test_parquet_partitioned.parquet") got = pd.read_parquet(hd_fpath) - expect = pd.read_parquet( - tmpdir.join("pandas_parquet_writer_partitioned.parquet") - ) + expect = pd.read_parquet(tmpdir.join("pandas_parquet_writer_partitioned.parquet")) assert_eq(expect, got) @@ -181,9 +171,7 @@ def test_read_json(tmpdir, pdf, hdfs, test_url): hdfs.upload(basedir + "/test_json_reader.json", buffer) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_json_reader.json".format( - host, port, basedir - ) + hd_fpath = "hdfs://{}:{}{}/test_json_reader.json".format(host, port, basedir) else: hd_fpath = f"hdfs://{basedir}/test_json_reader.json" @@ -221,9 +209,7 @@ def test_write_orc(pdf, hdfs, test_url): pdf["Integer2"] = pdf["Integer2"].astype("int64") gdf = cudf.from_pandas(pdf) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format( - host, port, basedir - ) + hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format(host, port, basedir) else: hd_fpath = f"hdfs://{basedir}/test_orc_writer.orc" diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 51e9a3022f4..8c139a75161 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -193,9 +193,7 @@ def test_pandas_as_index(): pdf_int_index = pd.Index([1, 2, 3, 4, 5]) pdf_uint_index = pd.Index([1, 2, 3, 4, 5]) pdf_float_index = pd.Index([1.0, 2.0, 3.0, 4.0, 5.0]) - pdf_datetime_index = pd.DatetimeIndex( - [1000000, 2000000, 3000000, 4000000, 5000000] - ) + pdf_datetime_index = pd.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"]) # Define cudf Indexes @@ -221,9 +219,7 @@ def test_pandas_as_index(): assert_eq( pdf_category_index.codes, - gdf_category_index.codes.astype( - pdf_category_index.codes.dtype - ).to_numpy(), + gdf_category_index.codes.astype(pdf_category_index.codes.dtype).to_numpy(), ) @@ -708,9 +704,7 @@ def 
test_index_argsort(data): pd.Index([1, 10, 2, 100, -10], name="abc"), pd.Index(["z", "x", "a", "c", "b"]), pd.Index(["z", "x", "a", "c", "b"], dtype="category"), - pd.Index( - [-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index" - ), + pd.Index([-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index"), pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), pd.RangeIndex(0, 10, 1), @@ -724,12 +718,8 @@ def test_index_sort_values(data, ascending, return_indexer): pdi = data gdi = cudf.from_pandas(pdi) - expected = pdi.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - actual = gdi.sort_values( - ascending=ascending, return_indexer=return_indexer - ) + expected = pdi.sort_values(ascending=ascending, return_indexer=return_indexer) + actual = gdi.sort_values(ascending=ascending, return_indexer=return_indexer) if return_indexer: expected_indexer = expected[1] @@ -1089,11 +1079,7 @@ def test_index_append_error(data, other): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) - got_dtype = ( - gd_other.dtype - if gd_data.dtype == np.dtype("object") - else gd_data.dtype - ) + got_dtype = gd_other.dtype if gd_data.dtype == np.dtype("object") else gd_data.dtype with pytest.raises( TypeError, match=re.escape( @@ -1243,9 +1229,7 @@ def test_index_append_list(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) @pytest.mark.parametrize("name", [1, "a", None]) def test_index_basic(data, dtype, name): pdi = pd.Index(data, dtype=dtype, name=name) @@ -1368,9 +1352,7 @@ def test_multiindex_append(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) def test_index_empty(data, dtype): pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) @@ -1379,9 +1361,7 @@ def test_index_empty(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) def test_index_size(data, dtype): pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) @@ -1390,9 +1370,7 @@ def test_index_size(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) def test_index_drop_duplicates(data, dtype): pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) @@ -1406,9 +1384,7 @@ def test_dropna_bad_how(): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) def test_index_tolist(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1424,9 +1400,7 @@ def test_index_tolist(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) -@pytest.mark.parametrize( - 
"dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) def test_index_iter_error(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1442,9 +1416,7 @@ def test_index_iter_error(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) def test_index_values_host(data, dtype): gdi = cudf.Index(data, dtype=dtype) pdi = pd.Index(data, dtype=dtype) @@ -1564,9 +1536,7 @@ def test_multiindex_from_arrow(): def test_index_equals_categories(): - lhs = cudf.CategoricalIndex( - ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] - ) + lhs = cudf.CategoricalIndex(["a", "b", "c", "b", "a"], categories=["a", "b", "c"]) rhs = cudf.CategoricalIndex( ["a", "b", "c", "b", "a"], categories=["a", "b", "c", "_"] ) @@ -1721,8 +1691,7 @@ def test_get_indexer_single_unique_numeric(idx, key, method): if ( # `method` only applicable to monotonic index - not pi.is_monotonic_increasing - and method is not None + not pi.is_monotonic_increasing and method is not None ): assert_exceptions_equal( lfunc=pi.get_loc, @@ -1930,8 +1899,7 @@ def test_get_indexer_single_duplicate_string(idx, key, method): if ( # `method` only applicable to monotonic index - (not pi.is_monotonic_increasing and method is not None) - or not pi.is_unique + (not pi.is_monotonic_increasing and method is not None) or not pi.is_unique ): assert_exceptions_equal( lfunc=pi.get_indexer, @@ -2027,15 +1995,11 @@ def test_get_loc_multi_numeric_deviate(idx, key, result): pi = idx gi = cudf.from_pandas(pi) - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): + with expect_warning_if(isinstance(key, tuple), pd.errors.PerformanceWarning): key_flag = key not in pi if key_flag: - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): + with expect_warning_if(isinstance(key, tuple), pd.errors.PerformanceWarning): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, @@ -2202,9 +2166,7 @@ def test_get_loc_multi_string(idx, key): ), ], ) -@pytest.mark.parametrize( - "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] -) +@pytest.mark.parametrize("key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]]) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_multi_string(idx, key, method): pi = idx.sort_values() @@ -2247,9 +2209,7 @@ def test_get_indexer_multi_string(idx, key, method): def test_get_indexer_invalid(idx1, idx2): idx1 = idx1() idx2 = idx2() - assert_eq( - idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) - ) + assert_eq(idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas())) @pytest.mark.parametrize( @@ -2427,9 +2387,7 @@ def test_index_type_methods(data, func): assert_eq(expected, actual) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) +@pytest.mark.parametrize("resolution", ["D", "h", "min", "s", "ms", "us", "ns"]) def test_index_datetime_ceil(resolution): cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pidx = cuidx.to_pandas() @@ -2440,9 +2398,7 @@ def test_index_datetime_ceil(resolution): assert_eq(pidx_ceil, cuidx_ceil) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) 
+@pytest.mark.parametrize("resolution", ["D", "h", "min", "s", "ms", "us", "ns"]) def test_index_datetime_floor(resolution): cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pidx = cuidx.to_pandas() @@ -2453,9 +2409,7 @@ def test_index_datetime_floor(resolution): assert_eq(pidx_floor, cuidx_floor) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) +@pytest.mark.parametrize("resolution", ["D", "h", "min", "s", "ms", "us", "ns"]) def test_index_datetime_round(resolution): cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pidx = cuidx.to_pandas() @@ -2641,9 +2595,7 @@ def test_index_constructor_integer(default_integer_bitwidth): def test_index_constructor_float(default_float_bitwidth): got = cudf.Index([1.0, 2.0, 3.0]) - expect = cudf.Index( - [1.0, 2.0, 3.0], dtype=f"float{default_float_bitwidth}" - ) + expect = cudf.Index([1.0, 2.0, 3.0], dtype=f"float{default_float_bitwidth}") assert_eq(expect, got) @@ -2679,9 +2631,7 @@ def test_rangeindex_take_default_user_option(default_integer_bitwidth): # configuration for take operation. idx = cudf.RangeIndex(0, 100) actual = idx.take([0, 3, 7, 62]) - expected = cudf.Index( - [0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}" - ) + expected = cudf.Index([0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}") assert_eq(expected, actual) @@ -2724,9 +2674,7 @@ def test_rangeindex_binops_user_option( # configuration for binary operation. idx = cudf.RangeIndex(1, 5) actual = op(idx) - expected = cudf.Index( - expected, dtype=f"{expected_kind}{default_integer_bitwidth}" - ) + expected = cudf.Index(expected, dtype=f"{expected_kind}{default_integer_bitwidth}") assert_eq( expected, actual, @@ -3038,9 +2986,7 @@ def test_empty_index_init(): assert_eq(pidx, gidx) -@pytest.mark.parametrize( - "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] -) +@pytest.mark.parametrize("data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]) @pytest.mark.parametrize("data_name", [None, 1, "abc"]) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 5f5c4579e01..4d7eecc767c 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -188,9 +188,7 @@ def test_series_indexing_large_size(): @pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])]) -@pytest.mark.parametrize( - "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] -) +@pytest.mark.parametrize("arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]]) def test_series_get_item(psr, arg): gsr = cudf.from_pandas(psr) @@ -205,12 +203,8 @@ def test_dataframe_column_name_indexing(): data = np.asarray(range(10), dtype=np.int32) df["a"] = data df[1] = data - np.testing.assert_equal( - df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32) - ) - np.testing.assert_equal( - df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) - ) + np.testing.assert_equal(df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32)) + np.testing.assert_equal(df[1].to_numpy(), np.asarray(range(10), dtype=np.int32)) pdf = pd.DataFrame() nelem = 10 @@ -241,13 +235,9 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): df = cudf.DataFrame() size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) + df["a"] = ha = np.random.randint(low=0, high=100, 
size=size).astype(np.int32) df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) + df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype(np.int64) df["d"] = hd = np.random.random(size).astype(np.float64) # Row slice first 10 @@ -326,17 +316,13 @@ def test_dataframe_loc(scalar, step): assert_eq(df.loc[begin:end, ["c", "d"]], pdf.loc[begin:end, ["c", "d"]]) # Slicing on columns: - assert_eq( - df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"] - ) + assert_eq(df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"]) # Slicing of size 1: assert_eq(df.loc[begin:begin, "a"], pdf.loc[begin:begin, "a"]) # TODO: Pandas changes the dtype here when it shouldn't - assert_eq( - df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False - ) + assert_eq(df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False) # Repeat with at[] assert_eq( @@ -383,9 +369,7 @@ def test_dataframe_loc_duplicate_index_scalar(): ) @pytest.mark.parametrize("arg", ["a", slice("a", "a"), slice("a", "b")]) def test_dataframe_loc_mask(mask, arg): - pdf = pd.DataFrame( - {"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]} - ) + pdf = pd.DataFrame({"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]}) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.loc[mask, arg], gdf.loc[mask, arg]) @@ -394,9 +378,7 @@ def test_dataframe_loc_mask(mask, arg): def test_dataframe_loc_outbound(): df = cudf.DataFrame() size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) + df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype(np.int32) df["b"] = hb = np.random.random(size).astype(np.float32) pdf = pd.DataFrame() @@ -441,9 +423,7 @@ def test_series_loc_float_index(): def test_series_loc_string(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] - ) + ps = pd.Series([1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"]) gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["one"], gs.loc["one"]) @@ -460,9 +440,7 @@ def test_series_loc_string(): def test_series_loc_datetime(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") - ) + ps = pd.Series([1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105")) gs = cudf.Series.from_pandas(ps) # a few different ways of specifying a datetime label: @@ -523,9 +501,7 @@ def test_series_loc_datetime(): def test_series_loc_categorical(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) - ) + ps = pd.Series([1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"])) gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["a"], gs.loc["a"]) @@ -537,9 +513,7 @@ def test_series_loc_categorical(): # order of categories changes, so we can only # compare values: - assert_eq( - ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_numpy() - ) + assert_eq(ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_numpy()) assert_eq( ps.loc[[True, False, True, False, True]], @@ -553,25 +527,19 @@ def test_series_loc_categorical(): pd.DataFrame( {"a": [1, 2, 3, 4]}, index=pd.MultiIndex.from_frame( - pd.DataFrame( - {"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]} - ) + pd.DataFrame({"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]}) ), ), pd.Series( [1, 2, 3, 4], index=pd.MultiIndex.from_frame( - pd.DataFrame( - {"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]} - ) + 
pd.DataFrame({"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]}) ), ), ], ) def test_dataframe_series_loc_multiindex(obj): - pindex = pd.MultiIndex.from_frame( - pd.DataFrame({"A": [3, 2], "B": ["high", "low"]}) - ) + pindex = pd.MultiIndex.from_frame(pd.DataFrame({"A": [3, 2], "B": ["high", "low"]})) gobj = cudf.from_pandas(obj) gindex = cudf.MultiIndex.from_pandas(pindex) @@ -627,9 +595,7 @@ def test_series_iloc(nelem): def test_dataframe_iloc(nelem): gdf = cudf.DataFrame() - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32) gdf["b"] = hb = np.random.random(nelem).astype(np.float32) pdf = pd.DataFrame() @@ -681,9 +647,7 @@ def test_dataframe_iloc(nelem): def test_dataframe_iloc_tuple(): gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32) gdf["b"] = hb = np.random.random(nelem).astype(np.float32) pdf = pd.DataFrame() @@ -697,9 +661,7 @@ def test_dataframe_iloc_tuple(): def test_dataframe_iloc_index_error(): gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32) gdf["b"] = hb = np.random.random(nelem).astype(np.float32) pdf = pd.DataFrame() @@ -1218,9 +1180,7 @@ def test_dataframe_setitem_iloc_multiindex(key, value, pdf_gdf_multi): def test_boolean_indexing_single_row(pdf_gdf): pdf, gdf = pdf_gdf - assert_eq( - pdf.loc[[True, False, False], :], gdf.loc[[True, False, False], :] - ) + assert_eq(pdf.loc[[True, False, False], :], gdf.loc[[True, False, False], :]) def test_iloc_negative_indices(): @@ -1688,9 +1648,7 @@ def test_dataframe_loc_inplace_update_shape_mismatch_RHS_df(): def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df(): gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) with pytest.raises(ValueError, match="shape mismatch:"): - gdf.iloc[[0, 2]] = cudf.DataFrame( - {"x": [10, 20]}, index=cudf.Index([0, 2]) - ) + gdf.iloc[[0, 2]] = cudf.DataFrame({"x": [10, 20]}, index=cudf.Index([0, 2])) @pytest.mark.parametrize( @@ -1803,18 +1761,14 @@ def test_boolean_mask_columns_iloc_series(): @pytest.mark.parametrize("index_type", ["single", "slice"]) def test_loc_timestamp_issue_8585(index_type): # https://github.com/rapidsai/cudf/issues/8585 - start = pd.Timestamp( - datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") - ) + start = pd.Timestamp(datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M")) end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=12) value = np.random.normal(size=12) df = pd.DataFrame(value, index=timestamps, columns=["value"]) cdf = cudf.from_pandas(df) if index_type == "single": - index = pd.Timestamp( - datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M") - ) + index = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) elif index_type == "slice": index = slice(start, end, None) else: @@ -1844,9 +1798,7 @@ def test_loc_timestamp_issue_8585(index_type): ) def test_loc_multiindex_timestamp_issue_8585(index_type): # https://github.com/rapidsai/cudf/issues/8585 - start = pd.Timestamp( - datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") - ) + start = pd.Timestamp(datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M")) end 
= pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=4) labels = ["A", "B", "C"] @@ -1856,14 +1808,10 @@ def test_loc_multiindex_timestamp_issue_8585(index_type): value = np.random.normal(size=12) df = pd.DataFrame(value, index=index, columns=["value"]) cdf = cudf.from_pandas(df) - start = pd.Timestamp( - datetime.strptime("2021-03-12 01:00", "%Y-%m-%d %H:%M") - ) + start = pd.Timestamp(datetime.strptime("2021-03-12 01:00", "%Y-%m-%d %H:%M")) end = pd.Timestamp(datetime.strptime("2021-03-12 02:00", "%Y-%m-%d %H:%M")) if index_type == "single": - index = pd.Timestamp( - datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M") - ) + index = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) elif index_type == "slice": index = slice(start, end, None) elif index_type == "date_range": @@ -2113,9 +2061,7 @@ def test_loc_index_inindex_subset(self, df, take_order): actual = df.loc[vals] assert_eq(expect, actual) - def test_loc_index_notinindex_slice( - self, request, df, order, dtype, take_order - ): + def test_loc_index_notinindex_slice(self, request, df, order, dtype, take_order): pdf = df.to_pandas() lo = pdf.index[1] hi = pdf.index[-2] @@ -2198,9 +2144,7 @@ def test_loc_setitem_categorical_integer_not_position_based(): @pytest.mark.parametrize("typ", ["datetime64[ns]", "timedelta64[ns]"]) @pytest.mark.parametrize("idx_method, key", [["iloc", 0], ["loc", "a"]]) -def test_series_iloc_scalar_datetimelike_return_pd_scalar( - typ, idx_method, key -): +def test_series_iloc_scalar_datetimelike_return_pd_scalar(typ, idx_method, key): obj = cudf.Series([1, 2, 3], index=list("abc"), dtype=typ) with cudf.option_context("mode.pandas_compatible", True): result = getattr(obj, idx_method)[key] @@ -2215,9 +2159,7 @@ def test_series_iloc_scalar_datetimelike_return_pd_scalar( def test_dataframe_iloc_scalar_datetimelike_return_pd_scalar( typ, idx_method, row_key, col_key ): - obj = cudf.DataFrame( - [1, 2, 3], index=list("abc"), columns=["a"], dtype=typ - ) + obj = cudf.DataFrame([1, 2, 3], index=list("abc"), columns=["a"], dtype=typ) with cudf.option_context("mode.pandas_compatible", True): result = getattr(obj, idx_method)[row_key, col_key] expected = getattr(obj.to_pandas(), idx_method)[row_key, col_key] @@ -2237,9 +2179,7 @@ def test_series_iloc_scalar_interval_return_pd_scalar(idx_method, key): @pytest.mark.parametrize( "idx_method, row_key, col_key", [["iloc", 0, 0], ["loc", "a", "a"]] ) -def test_dataframe_iloc_scalar_interval_return_pd_scalar( - idx_method, row_key, col_key -): +def test_dataframe_iloc_scalar_interval_return_pd_scalar(idx_method, row_key, col_key): iidx = cudf.IntervalIndex.from_breaks([1, 2, 3]) obj = cudf.DataFrame({"a": iidx}, index=list("ab")) with cudf.option_context("mode.pandas_compatible", True): @@ -2249,9 +2189,7 @@ def test_dataframe_iloc_scalar_interval_return_pd_scalar( def test_scalar_loc_row_categoricalindex(): - df = cudf.DataFrame( - range(4), index=cudf.CategoricalIndex(["a", "a", "b", "c"]) - ) + df = cudf.DataFrame(range(4), index=cudf.CategoricalIndex(["a", "a", "b", "c"])) result = df.loc["a"] expected = df.to_pandas().loc["a"] assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index a0e90cc89a2..59ecd25d9d2 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -67,9 +67,7 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got, 
check_dtype=psr.dtype != "object") -@pytest.mark.parametrize( - "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])] -) +@pytest.mark.parametrize("data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])]) def test_interpolate_series_unsorted_index(data, index): gsr = cudf.Series(data, index=index) psr = gsr.to_pandas() diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 7b923af1f75..4dd584f4447 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -57,9 +57,7 @@ def test_create_interval_series(data1, data2, data3, data4, closed): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) def test_create_interval_df(data1, data2, data3, data4, closed): # df for both pandas and cudf only works when interval is in a list - expect = pd.DataFrame( - [pd.Interval(data1, data2, closed)], dtype="interval" - ) + expect = pd.DataFrame([pd.Interval(data1, data2, closed)], dtype="interval") got = cudf.DataFrame([pd.Interval(data1, data2, closed)], dtype="interval") assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py index 8d71a6c05b8..335c9511995 100644 --- a/python/cudf/cudf/tests/test_join_order.py +++ b/python/cudf/cudf/tests/test_join_order.py @@ -45,8 +45,7 @@ def expected(left, right, sort, *, how): def test_join_ordering_pandas_compat(request, left, right, sort, how): request.applymarker( pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and how == "right", + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION and how == "right", reason="TODO: Result ording of suffix'ed columns is incorrect", ) ) @@ -127,9 +126,7 @@ def test_merge_combinations( expected = expected.sort_values("key") if not other_unique: other_value_counts = other["key"].value_counts() - repeats = other_value_counts.reindex( - expected["key"].values, fill_value=1 - ) + repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) repeats = repeats.astype(np.intp) expected = expected["key"].repeat(repeats.values) expected = expected.to_frame() diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index c063043b72a..43f67751e66 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -71,15 +71,11 @@ def assert_join_results_equal(expect, got, how, **kwargs): **kwargs, ) elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - if not len( - expect.columns - ): # can't sort_values() on a df without columns + if not len(expect.columns): # can't sort_values() on a df without columns return assert_eq(expect, got, **kwargs) assert_eq( - expect.sort_values(expect.columns.to_list()).reset_index( - drop=True - ), + expect.sort_values(expect.columns.to_list()).reset_index(drop=True), got.sort_values(got.columns.to_list()).reset_index(drop=True), **kwargs, ) @@ -151,9 +147,7 @@ def _check_series(expect, got): nanfilled_equal = np.all( expect.fillna(magic).values == got.fillna(magic).to_numpy() ) - msg = "direct_equal={}, nanfilled_equal={}".format( - direct_equal, nanfilled_equal - ) + msg = "direct_equal={}, nanfilled_equal={}".format(direct_equal, nanfilled_equal) assert direct_equal or nanfilled_equal, msg @@ -168,9 +162,7 @@ def test_dataframe_join_suffix(): left = df.set_index("a") right = df.set_index("c") - msg = ( - "there are overlapping columns but lsuffix and rsuffix are not defined" - ) + msg = "there are overlapping 
columns but lsuffix and rsuffix are not defined" with pytest.raises(ValueError, match=msg): left.join(right) @@ -246,9 +238,7 @@ def test_dataframe_join_mismatch_cats(how): "data_col_left": [10, 20, 30, 40, 50], } ) - pdf2 = pd.DataFrame( - {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]} - ) + pdf2 = pd.DataFrame({"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]}) pdf1["join_col"] = pdf1["join_col"].astype("category") pdf2["join_col"] = pdf2["join_col"].astype("category") @@ -317,9 +307,7 @@ def test_dataframe_merge_on(on): for col in list(pddf_joined.columns): if col.count("_y") > 0: - join_result[col] = ( - join_result[col].astype(np.float64).fillna(np.nan) - ) + join_result[col] = join_result[col].astype(np.float64).fillna(np.nan) join_result_cudf[col] = ( join_result_cudf[col].astype(np.float64).fillna(np.nan) ) @@ -331,9 +319,9 @@ def test_dataframe_merge_on(on): .reset_index(drop=True) ) - pdf_result = pddf_joined.sort_values( - list(pddf_joined.columns) - ).reset_index(drop=True) + pdf_result = pddf_joined.sort_values(list(pddf_joined.columns)).reset_index( + drop=True + ) assert_join_results_equal(cdf_result, pdf_result, how="left") @@ -476,9 +464,7 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): match="No common columns to perform merge on", ): pdf_left.merge(pdf_right) - with pytest.raises( - ValueError, match="No common columns to perform merge on" - ): + with pytest.raises(ValueError, match="No common columns to perform merge on"): gdf_left.merge(gdf_right) elif not [value for value in pdf_left if value in pdf_right]: with pytest.raises( @@ -486,9 +472,7 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): match="No common columns to perform merge on", ): pdf_left.merge(pdf_right) - with pytest.raises( - ValueError, match="No common columns to perform merge on" - ): + with pytest.raises(ValueError, match="No common columns to perform merge on"): gdf_left.merge(gdf_right) else: pdf_result = pdf_left.merge(pdf_right, how=how) @@ -548,9 +532,7 @@ def test_empty_joins(how, left_empty, right_empty): def test_merge_left_index_zero(): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) - right = pd.DataFrame( - {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] - ) + right = pd.DataFrame({"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6]) gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on="x", right_on="y") @@ -570,9 +552,7 @@ def test_merge_left_index_zero(): ) def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) - right = pd.DataFrame( - {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] - ) + right = pd.DataFrame({"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6]) gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) @@ -591,9 +571,7 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): ) def test_merge_left_right_index_left_right_on_kwargs(kwargs): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6]) - right = pd.DataFrame( - {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7] - ) + right = pd.DataFrame({"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7]) gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) @@ -684,9 +662,7 @@ def test_merge_left_right_index_left_right_on_kwargs2(kwargs): assert gd_merge.empty 
-@pytest.mark.parametrize( - "hows", [{"how": "inner"}, {"how": "left"}, {"how": "outer"}] -) +@pytest.mark.parametrize("hows", [{"how": "inner"}, {"how": "left"}, {"how": "outer"}]) @pytest.mark.parametrize( "ons", [ @@ -725,12 +701,8 @@ def test_merge_sort(ons, hows): gd_merge = gd_merge.drop(kwargs["on"], axis=1) if not pd_merge.empty: # check to make sure the non join key columns are the same - pd_merge = pd_merge.sort_values(list(pd_merge.columns)).reset_index( - drop=True - ) - gd_merge = gd_merge.sort_values(list(gd_merge.columns)).reset_index( - drop=True - ) + pd_merge = pd_merge.sort_values(list(pd_merge.columns)).reset_index(drop=True) + gd_merge = gd_merge.sort_values(list(gd_merge.columns)).reset_index(drop=True) assert_join_results_equal(pd_merge, gd_merge, how="left") @@ -848,9 +820,7 @@ def test_join_empty_table_dtype(): ), ( pd.Series(["dog", "cat", "fish", "bug"] * 2).astype("category"), - pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype( - "category" - ), + pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype("category"), ), ], ) @@ -1085,9 +1055,7 @@ def test_typecast_on_join_no_float_round(): exp_By = ["a", "b", "c", None, None] exp_join_col = cudf.Series(exp_join_data, dtype="float32") - expect = cudf.DataFrame( - {"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By} - ) + expect = cudf.DataFrame({"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By}) got = gdf_l.merge(gdf_r, on="join_col", how="left") @@ -1147,12 +1115,8 @@ def test_typecast_on_join_overflow_unsafe(dtypes): def test_decimal_typecast_inner(dtype): other_data = ["a", "b", "c", "d", "e"] - join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype( - dtype - ) - join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype( - dtype - ) + join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype(dtype) + join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype(dtype) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) @@ -1188,12 +1152,8 @@ def test_decimal_typecast_inner(dtype): def test_decimal_typecast_left(dtype): other_data = ["a", "b", "c", "d"] - join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype( - dtype - ) - join_data_r = cudf.Series( - ["95.05", "62.4056", "74.22", "1456.9472"] - ).astype(dtype) + join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype(dtype) + join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1456.9472"]).astype(dtype) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) @@ -1229,12 +1189,8 @@ def test_decimal_typecast_left(dtype): ) def test_decimal_typecast_outer(dtype): other_data = ["a", "b", "c"] - join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype( - dtype - ) - join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype( - dtype - ) + join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype(dtype) + join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype(dtype) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"] @@ -1265,12 +1221,8 @@ def test_decimal_typecast_outer(dtype): def test_mixed_decimal_typecast(dtype_l, dtype_r): other_data = ["a", "b", "c", "d"] - join_data_l = cudf.Series(["95.05", 
"34.6", "74.22", "14.94"]).astype( - dtype_r - ) - join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype( - dtype_l - ) + join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype(dtype_r) + join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype(dtype_l) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) @@ -1590,9 +1542,7 @@ def test_index_join(lhs, rhs, how, level): def test_index_join_corner_cases(): l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_pdf = pd.DataFrame( - {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} - ) + r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}) l_df = cudf.from_pandas(l_pdf) r_df = cudf.from_pandas(r_pdf) @@ -1649,9 +1599,7 @@ def test_index_join_corner_cases(): def test_index_join_exception_cases(): l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_df = cudf.DataFrame( - {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} - ) + r_df = cudf.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}) # Join between two MultiIndex lhs = ["a", "b"] @@ -1793,9 +1741,7 @@ def test_typecast_on_join_indexes_matching_categorical(): cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), ], ) -@pytest.mark.parametrize( - "how", ["left", "inner", "outer", "leftanti", "leftsemi"] -) +@pytest.mark.parametrize("how", ["left", "inner", "outer", "leftanti", "leftsemi"]) @pytest.mark.parametrize( "kwargs", [ @@ -1825,9 +1771,7 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): @pytest.mark.xfail(reason="Cannot sort values of list dtype") -@pytest.mark.parametrize( - "how", ["left", "inner", "right", "leftanti", "leftsemi"] -) +@pytest.mark.parametrize("how", ["left", "inner", "right", "leftanti", "leftsemi"]) def test_merge_with_lists(how): pd_left = pd.DataFrame( { @@ -1858,9 +1802,7 @@ def test_join_renamed_index(): ).set_index([0, 1]) df.index.names = ["a", "b"] # doesn't actually change df._index._data - expect = df.to_pandas().merge( - df.to_pandas(), left_index=True, right_index=True - ) + expect = df.to_pandas().merge(df.to_pandas(), left_index=True, right_index=True) got = df.merge(df, left_index=True, right_index=True, how="inner") assert_join_results_equal(expect, got, how="inner") @@ -2121,9 +2063,7 @@ def test_string_join_values_nulls(): ) def test_merge_mixed_index_columns(left_on, right_on): left = pd.DataFrame({"a": [1, 2, 1, 2], "b": [2, 3, 3, 4]}).set_index("a") - right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index( - "a" - ) + right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index("a") left["c"] = 10 @@ -2168,12 +2108,8 @@ def test_join_on_index_with_duplicate_names(): # overall, we *should* be able to join on them: lhs = pd.DataFrame({"a": [1, 2, 3]}) rhs = pd.DataFrame({"b": [1, 2, 3]}) - lhs.index = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (2, 1)], names=["x", "x"] - ) - rhs.index = pd.MultiIndex.from_tuples( - [(1, 1), (1, 3), (2, 1)], names=["x", "x"] - ) + lhs.index = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["x", "x"]) + rhs.index = pd.MultiIndex.from_tuples([(1, 1), (1, 3), (2, 1)], names=["x", "x"]) expect = lhs.join(rhs, how="inner") lhs = cudf.from_pandas(lhs) @@ -2187,9 +2123,7 @@ def test_join_redundant_params(): lhs = cudf.DataFrame( {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") ) - rhs = cudf.DataFrame( - {"b": [1, 2, 3]}, 
index=cudf.Index([0, 1, 2], name="a") - ) + rhs = cudf.DataFrame({"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a")) with pytest.raises(ValueError): lhs.merge(rhs, on="a", left_index=True) with pytest.raises(ValueError): @@ -2202,11 +2136,7 @@ def test_join_redundant_params(): def test_join_multiindex_index(): # test joining a MultiIndex with an Index with overlapping name - lhs = ( - cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) - .set_index(["a", "b"]) - .index - ) + lhs = cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}).set_index(["a", "b"]).index rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") got = lhs.join(rhs, how="inner") diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 40935733f34..d33133f32f5 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -37,10 +37,7 @@ def pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - } + {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types} ) # Delete the name of the column index, and rename the row index test_pdf.columns.name = None @@ -91,8 +88,7 @@ def json_files(request, tmp_path_factory, pdf): index, compression, orient = request.param if index is False and orient not in ("split", "table"): pytest.skip( - "'index=False' is only valid when 'orient' is 'split' or " - "'table'" + "'index=False' is only valid when 'orient' is 'split' or " "'table'" ) if index is False and orient == "table": pytest.skip("'index=False' isn't valid when 'orient' is 'table'") @@ -175,9 +171,7 @@ def test_json_writer(tmpdir, pdf, gdf): assert_eq(pdf_string, gdf_string) -@pytest.mark.parametrize( - "lines", [True, False], ids=["lines=True", "lines=False"] -) +@pytest.mark.parametrize("lines", [True, False], ids=["lines=True", "lines=False"]) def test_cudf_json_writer(pdf, lines): # removing datetime column because pandas doesn't support it for col_name in pdf.columns: @@ -198,12 +192,9 @@ def test_cudf_json_writer(pdf, lines): def test_cudf_json_writer_read(gdf_writer_types): dtypes = { - col_name: col_name[len("col_") :] - for col_name in gdf_writer_types.columns + col_name: col_name[len("col_") :] for col_name in gdf_writer_types.columns } - gdf_string = gdf_writer_types.to_json( - orient="records", lines=True, engine="cudf" - ) + gdf_string = gdf_writer_types.to_json(orient="records", lines=True, engine="cudf") gdf2 = cudf.read_json( StringIO(gdf_string), lines=True, @@ -302,9 +293,7 @@ def test_cudf_json_writer_sinks(sink, tmp_path_factory): target = tmp_path_factory.mktemp("json") / "test_df.json" df.to_json(target, engine="cudf") if sink == "string": - assert ( - target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' - ) + assert target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' elif sink == "file": assert os.path.exists(target) with open(target, "r") as f: @@ -422,37 +411,27 @@ def test_json_read_directory(tmpdir, json_input, engine): def test_json_lines_byte_range(json_input): # include the first row and half of the second row # should parse the first two rows - will_warn = isinstance(json_input, str) and not json_input.endswith( - ".json" - ) + will_warn = isinstance(json_input, str) and not json_input.endswith(".json") with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), 
lines=True, byte_range=(0, 15) - ) + df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(0, 15)) assert df.shape == (2, 3) # include half of the second row and half of the third row # should parse only the third row with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 10) - ) + df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(15, 10)) assert df.shape == (1, 3) # include half of the second row and entire third row # should parse only the third row with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 0) - ) + df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(15, 0)) assert df.shape == (1, 3) # include half of the second row till past the end of the file # should parse only the third row with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(10, 50) - ) + df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(10, 50)) assert df.shape == (1, 3) @@ -495,9 +474,7 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): @pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings( - "ignore:engine='cudf_legacy' is a deprecated engine." -) +@pytest.mark.filterwarnings("ignore:engine='cudf_legacy' is a deprecated engine.") def test_json_engine_selection(): json = "[1, 2, 3]" @@ -541,9 +518,7 @@ def test_json_bool_values(): np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) -@pytest.mark.filterwarnings( - "ignore:engine='cudf_legacy' is a deprecated engine." -) +@pytest.mark.filterwarnings("ignore:engine='cudf_legacy' is a deprecated engine.") @pytest.mark.parametrize( "buffer", [ @@ -559,9 +534,7 @@ def test_json_null_literal(buffer): # first column contains a null field, type should be set to float # second column contains only empty fields, type should be set to int8 np.testing.assert_array_equal(df.dtypes, ["float64", "int8"]) - np.testing.assert_array_equal( - df["0"].to_numpy(na_value=np.nan), [1.0, np.nan] - ) + np.testing.assert_array_equal(df["0"].to_numpy(na_value=np.nan), [1.0, np.nan]) np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0]) @@ -588,12 +561,8 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_pandas( ) pdf.to_json(fname, compression="infer", lines=True, orient="records") - df = cudf.read_json( - fname, compression="infer", lines=True, orient="records" - ) - pdf = pd.read_json( - fname, compression="infer", lines=True, orient="records" - ) + df = cudf.read_json(fname, compression="infer", lines=True, orient="records") + pdf = pd.read_json(fname, compression="infer", lines=True, orient="records") assert_eq(cudf.DataFrame(pdf), df) @@ -606,9 +575,7 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_strings(): {"a":"\'","b":"\\t","c":"cudf"}""" ) - df = cudf.read_json( - str_buffer, compression="infer", lines=True, orient="records" - ) + df = cudf.read_json(str_buffer, compression="infer", lines=True, orient="records") expected = { "a": ['ab"cd', "\\\b", "\r\\", "'"], @@ -660,9 +627,7 @@ def test_json_to_json_special_characters(): ( cudf.DataFrame( { - "int64 col": cudf.Series( - [1, 2, None, 2323, None], dtype="int64" - ), + "int64 col": cudf.Series([1, 2, None, 2323, None], dtype="int64"), "string col": cudf.Series( ["abc", "a", None, "", None], dtype="str" ), @@ -700,9 +665,7 @@ def test_json_to_json_special_characters(): [None, True, False, 
None, True], dtype=pd.BooleanDtype(), ), - "categorical col": pd.Series( - [1, 2, 1, None, 2], dtype="category" - ), + "categorical col": pd.Series([1, 2, 1, None, 2], dtype="category"), "datetime col": pd.Series( [1231233, None, 2323234, None, 1], dtype="datetime64[ns]", @@ -835,9 +798,7 @@ def test_json_nested_lines(data, lines): # such that pandas would have the f1 member with null # Also, pyarrow chooses to select different ordering of a nested column # children though key-value pairs are correct. - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) assert df.to_arrow().equals(pa_table_pdf) @@ -849,9 +810,7 @@ def test_json_nested_data(): df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") pdf = pd.read_json(StringIO(json_str), orient="records") pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) assert df.to_arrow().equals(pa_table_pdf) @@ -880,9 +839,7 @@ def test_json_types_data(): df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") pdf = pd.read_json(StringIO(json_str), orient="records") pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) assert df.to_arrow().equals(pa_table_pdf) @@ -1105,13 +1062,9 @@ def test_json_dtypes_nested_data(): }, ) - pdf = pd.read_json( - StringIO(expected_json_str), orient="records", lines=True - ) + pdf = pd.read_json(StringIO(expected_json_str), orient="records", lines=True) pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) assert df.to_arrow().equals(pa_table_pdf) @@ -1287,9 +1240,7 @@ def test_json_array_of_arrays(data, lines): # for values orient in cudf json reader pdf.rename(columns={name: str(name) for name in pdf.columns}, inplace=True) # assert_eq(pdf, df) - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) assert df.to_arrow().equals(pa_table_pdf) @@ -1385,9 +1336,7 @@ def _replace_with_nulls(df, replace_items): # {} in pandas is represented as {"0": None} in cudf assert_eq(gdf, pdf) assert_eq(gdf2, pdf) - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=gdf.to_arrow().schema, safe=False - ) + pa_table_pdf = pa.Table.from_pandas(pdf, schema=gdf.to_arrow().schema, safe=False) assert gdf.to_arrow().equals(pa_table_pdf) assert gdf2.to_arrow().equals(pa_table_pdf) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f04cb8a91a4..432081552f7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -203,15 +203,11 @@ def test_take(data, idx): ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), ( [["a", "b"], ["c"]], - pytest.raises( - TypeError, match="should be column of values of index types" - ), + pytest.raises(TypeError, match="should be column of values of index types"), ), ( [[[1], [0]], [[0]]], - pytest.raises( - TypeError, match="should be column of values of index types" - ), + 
pytest.raises(TypeError, match="should be column of values of index types"), ), ([[0, 1], None], pytest.raises(ValueError, match="contains null")), ], @@ -278,9 +274,7 @@ def key_func_builder(x, na_position): [ None, pd.Index(["a", "b", "c"]), - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] - ), + pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]), ], ) @pytest.mark.parametrize("ascending", [True, False]) @@ -363,9 +357,7 @@ def test_get_default(): assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) - string_sr = cudf.Series( - [["apple", "banana"], ["carrot", "daffodil", "elephant"]] - ) + string_sr = cudf.Series([["apple", "banana"], ["carrot", "daffodil", "elephant"]]) assert_eq( cudf.Series(["default", "elephant"]), string_sr.list.get(2, default="default"), @@ -377,9 +369,7 @@ def test_get_default(): sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1)) assert_eq(cudf.Series([[5, 6], cudf.NA]), sr_nested.list.get(2)) - assert_eq( - cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) - ) + assert_eq(cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0])) def test_get_ind_sequence(): @@ -480,8 +470,7 @@ def test_contains_invalid(data, scalar): sr = cudf.Series(data) with pytest.raises( TypeError, - match="Type/Scale of search key does not " - "match list column element type.", + match="Type/Scale of search key does not " "match list column element type.", ): sr.list.contains(scalar) @@ -538,9 +527,7 @@ def test_index(data, search_key, expect): if is_scalar(search_key): got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) else: - got = sr.list.index( - cudf.Series(search_key, dtype=sr.dtype.element_type) - ) + got = sr.list.index(cudf.Series(search_key, dtype=sr.dtype.element_type)) assert_eq(expect, got) @@ -566,8 +553,7 @@ def test_index_invalid_type(data, search_key): sr = cudf.Series(data) with pytest.raises( TypeError, - match="Type/Scale of search key does not " - "match list column element type.", + match="Type/Scale of search key does not " "match list column element type.", ): sr.list.index(search_key) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 53919a95115..8e0fed12c19 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -16,12 +16,8 @@ @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) def test_range_index(testrange): - index = RangeIndex( - start=testrange[0], stop=testrange[1], step=testrange[2] - ) - index_pd = pd.RangeIndex( - start=testrange[0], stop=testrange[1], step=testrange[2] - ) + index = RangeIndex(start=testrange[0], stop=testrange[1], step=testrange[2]) + index_pd = pd.RangeIndex(start=testrange[0], stop=testrange[1], step=testrange[2]) assert index.is_unique == index_pd.is_unique assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing @@ -68,9 +64,7 @@ def test_string_index(testlist): assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing -@pytest.mark.parametrize( - "testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]] -) +@pytest.mark.parametrize("testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]]) def test_categorical_index(testlist): # Assuming unordered categorical data cannot be "monotonic" raw_cat = pd.Categorical(testlist, 
ordered=True) @@ -163,12 +157,8 @@ def test_multiindex(): gdf = cudf.from_pandas(pdf) assert pdf.index.is_unique == gdf.index.is_unique - assert ( - pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing - ) - assert ( - pdf.index.is_monotonic_decreasing == gdf.index.is_monotonic_decreasing - ) + assert pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing + assert pdf.index.is_monotonic_decreasing == gdf.index.is_monotonic_decreasing @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 4926d79e734..9d33453da59 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -262,9 +262,9 @@ def test_multiindex_transpose(pdf, pdfIndex): def test_from_pandas_series(): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - ).set_index(["a", "b"]) + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) result = cudf.from_pandas(pdf) assert_eq(pdf, result) @@ -292,12 +292,8 @@ def test_multiindex_take(pdf, gdf, pdfIndex): assert_eq(pdf.index.take(pd.Series([0])), gdf.index.take(Series([0]))) assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1])) - assert_eq( - pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1])) - ) - assert_eq( - pdf.index.take(pd.Series([0, 1])), gdf.index.take(Series([0, 1])) - ) + assert_eq(pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1]))) + assert_eq(pdf.index.take(pd.Series([0, 1])), gdf.index.take(Series([0, 1]))) def test_multiindex_getitem(pdf, gdf, pdfIndex): @@ -392,9 +388,7 @@ def test_multiindex_loc_errors(pdf, gdf, pdfIndex): with pytest.raises(KeyError): gdf.loc[("a", "store", "clouds", "foo")] with pytest.raises(IndexError): - gdf.loc[ - ("a", "store", "clouds", "fire", "x", "y") - ] # too many indexers + gdf.loc[("a", "store", "clouds", "fire", "x", "y")] # too many indexers with pytest.raises(IndexError): gdf.loc[slice(None, ("a", "store", "clouds", "fire", "x", "y"))] @@ -495,9 +489,7 @@ def test_multiindex_from_tuples(): def test_multiindex_from_dataframe(): if not hasattr(pd.MultiIndex([[]], [[]]), "codes"): pytest.skip() - pdf = pd.DataFrame( - [["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]] - ) + pdf = pd.DataFrame([["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]]) gdf = cudf.from_pandas(pdf) pmi = pd.MultiIndex.from_frame(pdf, names=["alpha", "location"]) gmi = cudf.MultiIndex.from_frame(gdf, names=["alpha", "location"]) @@ -536,9 +528,7 @@ def test_multiindex_index_and_columns(): names=["x", "y"], ) gdf.index = mi - mc = cudf.MultiIndex( - levels=[["val"], ["mean", "min"]], codes=[[0, 0], [0, 1]] - ) + mc = cudf.MultiIndex(levels=[["val"], ["mean", "min"]], codes=[[0, 0], [0, 1]]) gdf.columns = mc pdf.index = mi.to_pandas() pdf.columns = mc.to_pandas() @@ -830,43 +820,29 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): - same_ref = (not deep) or ( - cudf.get_option("copy_on_write") and not deep - ) + same_ref = (not deep) or (cudf.get_option("copy_on_write") and not deep) mi1 = data mi2 = mi1.copy(deep=deep) # Assert ._levels identity lptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi1._levels + lv._data._data[None].base_data.get_ptr(mode="read") for lv in mi1._levels ] rptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in 
mi2._levels + lv._data._data[None].base_data.get_ptr(mode="read") for lv in mi2._levels ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity - lptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi1._codes._data.items() - ] - rptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi2._codes._data.items() - ] + lptrs = [c.base_data.get_ptr(mode="read") for _, c in mi1._codes._data.items()] + rptrs = [c.base_data.get_ptr(mode="read") for _, c in mi2._codes._data.items()] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for _, d in mi1._data.items()] + rptrs = [d.base_data.get_ptr(mode="read") for _, d in mi2._data.items()] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) @@ -908,9 +884,7 @@ def test_multiindex_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): presult = pdf.iloc[iloc_rows, iloc_columns] gresult = gdf.iloc[iloc_rows, iloc_columns] if isinstance(gresult, cudf.DataFrame): - assert_eq( - presult, gresult, check_index_type=False, check_column_type=False - ) + assert_eq(presult, gresult, check_index_type=False, check_column_type=False) else: assert_eq(presult, gresult, check_index_type=False, check_dtype=False) @@ -919,9 +893,7 @@ def test_multiindex_iloc_scalar(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] tuples = list(zip(*arrays)) idx = cudf.MultiIndex.from_tuples(tuples) - gdf = cudf.DataFrame( - {"first": cp.random.rand(4), "second": cp.random.rand(4)} - ) + gdf = cudf.DataFrame({"first": cp.random.rand(4), "second": cp.random.rand(4)}) gdf.index = idx pdf = gdf.to_pandas() @@ -970,17 +942,13 @@ def test_multicolumn_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): if isinstance(name, str) and "cudf" in name: gresult.name = name if isinstance(presult, pd.DataFrame): - assert_eq( - presult, gresult, check_index_type=False, check_column_type=False - ) + assert_eq(presult, gresult, check_index_type=False, check_column_type=False) else: assert_eq(presult, gresult, check_index_type=False, check_dtype=False) def test_multicolumn_item(): - gdf = cudf.DataFrame( - {"x": np.arange(10), "y": np.arange(10), "z": np.arange(10)} - ) + gdf = cudf.DataFrame({"x": np.arange(10), "y": np.arange(10), "z": np.arange(10)}) gdg = gdf.groupby(["x", "y"]).min() gdgT = gdg.T pdgT = gdgT.to_pandas() @@ -1066,9 +1034,7 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdf.index = gdfIndex # The index is unsorted, which makes things slow but is fine for testing. 
with pytest.warns(pd.errors.PerformanceWarning): - assert_eq( - pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index() - ) + assert_eq(pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index()) assert_eq( pdf.loc[(("a"), ("store")), :].sort_index(), gdf.loc[(("a"), ("store")), :].sort_index(), @@ -1090,12 +1056,8 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), ) assert_eq( - pdf.loc[ - (slice(None), slice(None), slice(None), "smoke"), : - ].sort_index(), - gdf.loc[ - (slice(None), slice(None), slice(None), "smoke"), : - ].sort_index(), + pdf.loc[(slice(None), slice(None), slice(None), "smoke"), :].sort_index(), + gdf.loc[(slice(None), slice(None), slice(None), "smoke"), :].sort_index(), ) @@ -1422,12 +1384,8 @@ def test_multiindex_sort_values(pmidx, ascending, return_indexer): pmidx = pmidx midx = cudf.from_pandas(pmidx) - expected = pmidx.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - actual = midx.sort_values( - ascending=ascending, return_indexer=return_indexer - ) + expected = pmidx.sort_values(ascending=ascending, return_indexer=return_indexer) + actual = midx.sort_values(ascending=ascending, return_indexer=return_indexer) if return_indexer: expected_indexer = expected[1] @@ -1510,9 +1468,7 @@ def test_multiindex_set_names(idx, names, inplace): @pytest.mark.parametrize( "idx", [ - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] - ), + pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019], ["aab", "bcd"]]), pd.MultiIndex.from_product( [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], names=[1, 0, 2], @@ -1531,9 +1487,7 @@ def test_multiindex_set_names(idx, names, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names_default_and_int_names( - idx, level, names, inplace -): +def test_multiindex_set_names_default_and_int_names(idx, level, names, inplace): pi = idx.copy() gi = cudf.from_pandas(idx) @@ -1586,9 +1540,7 @@ def test_multiindex_set_names_string_names(idx, level, names, inplace): "level, names", [(1, ["a"]), (None, "a"), ([1, 2], ["a"]), (None, ["a"])] ) def test_multiindex_set_names_error(level, names): - pi = pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] - ) + pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019], ["aab", "bcd"]]) gi = cudf.from_pandas(pi) assert_exceptions_equal( @@ -1633,9 +1585,7 @@ def test_multiindex_rename(idx, names, inplace): assert_eq(expected, actual) -@pytest.mark.parametrize( - "names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]] -) +@pytest.mark.parametrize("names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]]) def test_multiindex_rename_error(names): pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) gi = cudf.from_pandas(pi) @@ -1698,12 +1648,8 @@ def test_difference(): "idx1, idx2", [ ( - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ), + pd.MultiIndex.from_arrays([[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]), + pd.MultiIndex.from_arrays([[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]]), ), ( pd.MultiIndex.from_arrays( @@ -1748,12 +1694,8 @@ def test_union_mulitIndex(idx1, idx2, sort): "idx1, idx2", [ ( - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[1, 3, 2, 2], ["Red", "Green", 
"Red", "Green"]] - ), + pd.MultiIndex.from_arrays([[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]), + pd.MultiIndex.from_arrays([[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]]), ), ( pd.MultiIndex.from_arrays( @@ -1828,9 +1770,7 @@ def test_pickle_roundtrip_multiindex(names): @pytest.mark.parametrize( "pidx", [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), + pd.MultiIndex.from_arrays([[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]), pd.MultiIndex.from_arrays( [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], names=["a", "b", "c"], @@ -1871,9 +1811,7 @@ def test_multiindex_index_single_row(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] tuples = list(zip(*arrays)) idx = cudf.MultiIndex.from_tuples(tuples) - gdf = cudf.DataFrame( - {"first": cp.random.rand(4), "second": cp.random.rand(4)} - ) + gdf = cudf.DataFrame({"first": cp.random.rand(4), "second": cp.random.rand(4)}) gdf.index = idx pdf = gdf.to_pandas() assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)]) @@ -1946,9 +1884,7 @@ def test_multiindex_to_series_error(): ) @pytest.mark.parametrize("allow_duplicates", [True, False]) @pytest.mark.parametrize("index", [True, False]) -def test_multiindex_to_frame_allow_duplicates( - pidx, name, allow_duplicates, index -): +def test_multiindex_to_frame_allow_duplicates(pidx, name, allow_duplicates, index): gidx = cudf.from_pandas(pidx) if name is None or ( diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 2e3be92dbeb..a189ec8bc6b 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -43,9 +43,7 @@ def test_can_cast_safely_same_kind(): to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) - data = cudf.Series( - [np.finfo("float32").max * 2, 1.0], dtype="float64" - )._column + data = cudf.Series([np.finfo("float32").max * 2, 1.0], dtype="float64")._column to_dtype = np.dtype("float32") assert not data.can_cast_safely(to_dtype) @@ -142,9 +140,7 @@ def test_can_cast_safely_has_nulls(): ), # Categories with nulls pd.Series([1, 2, 3], dtype=pd.CategoricalDtype(categories=[1, 2])), - pd.Series( - [5.0, 6.0], dtype=pd.CategoricalDtype(categories=[5.0, 6.0]) - ), + pd.Series([5.0, 6.0], dtype=pd.CategoricalDtype(categories=[5.0, 6.0])), pd.Series( ["2020-08-01 08:00:00", "1960-08-01 08:00:00"], dtype=np.dtype("= sizeof @@ -81,9 +79,7 @@ def test_packed_dataframe_equality_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) + df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) df["vals"] = np.random.random(len(df)) check_packed_equality(df) @@ -103,9 +99,7 @@ def test_packed_dataframe_equality_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) + df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) df["vals"] = np.random.random(len(df)) check_packed_equality(df) @@ -149,9 +143,7 @@ def test_packed_dataframe_unique_pointers_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) + df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) df["vals"] = np.random.random(len(df)) check_packed_unique_pointers(df) @@ -171,9 +163,7 @@ def test_packed_dataframe_unique_pointers_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series( - 
list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) + df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) df["vals"] = np.random.random(len(df)) check_packed_unique_pointers(df) @@ -192,9 +182,7 @@ def check_packed_pickled_equality(df): assert_packed_frame_picklable(sortvaldf) # out-of-band buffers = [] - serialbytes = pickle.dumps( - pack(df), protocol=5, buffer_callback=buffers.append - ) + serialbytes = pickle.dumps(pack(df), protocol=5, buffer_callback=buffers.append) for b in buffers: assert isinstance(b, pickle.PickleBuffer) loaded = unpack(pickle.loads(serialbytes, buffers=buffers)) @@ -222,9 +210,7 @@ def test_pickle_packed_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) + df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) df["vals"] = np.random.random(len(df)) check_packed_pickled_equality(df) @@ -244,9 +230,7 @@ def test_pickle_packed_dataframe_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) + df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) df["vals"] = np.random.random(len(df)) check_packed_pickled_equality(df) @@ -287,9 +271,7 @@ def test_serialize_packed_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) + df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) df["vals"] = np.random.random(len(df)) check_packed_serialized_equality(df) @@ -309,9 +291,7 @@ def test_serialize_packed_dataframe_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) + df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) df["vals"] = np.random.random(len(df)) check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 18efd4417a1..70cd7338b84 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -72,10 +72,7 @@ def simple_pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - }, + {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types}, # Need to ensure that this index is not a RangeIndex to get the # expected round-tripping behavior from Parquet reader/writer. index=pd.Index(list(range(nrows))), @@ -114,10 +111,7 @@ def build_pdf(num_columns, day_resolution_timestamps): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - }, + {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types}, # Need to ensure that this index is not a RangeIndex to get the # expected round-tripping behavior from Parquet reader/writer. 
index=pd.Index(list(range(nrows))), @@ -148,9 +142,7 @@ def build_pdf(num_columns, day_resolution_timestamps): ] if day_resolution_timestamps: data = [int(d / t["dayModulus"]) * t["dayModulus"] for d in data] - test_pdf["col_" + t["name"]] = pd.Series( - np.asarray(data, dtype=t["name"]) - ) + test_pdf["col_" + t["name"]] = pd.Series(np.asarray(data, dtype=t["name"])) # Create non-numeric categorical data otherwise parquet may typecast it data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] @@ -185,9 +177,7 @@ def gdf_day_timestamps(pdf_day_timestamps): @pytest.fixture(params=["snappy", "gzip", "brotli", None, np.str_("snappy")]) def parquet_file(request, tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("parquet") / ( - str(request.param) + "_test.parquet" - ) + fname = tmp_path_factory.mktemp("parquet") / (str(request.param) + "_test.parquet") pdf.to_parquet(fname, engine="pyarrow", compression=request.param) return fname @@ -340,9 +330,7 @@ def test_parquet_reader_index_col(tmpdir, index_col, columns): @pytest.mark.parametrize("pandas_compat", [True, False]) -@pytest.mark.parametrize( - "columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None] -) +@pytest.mark.parametrize("columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None]) def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): df = pd.DataFrame( { @@ -363,9 +351,7 @@ def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): expect = pa.parquet.read_table( fname, columns=columns, use_pandas_metadata=pandas_compat ).to_pandas() - got = cudf.read_parquet( - fname, columns=columns, use_pandas_metadata=pandas_compat - ) + got = cudf.read_parquet(fname, columns=columns, use_pandas_metadata=pandas_compat) if pandas_compat or columns is None or "b" in columns: assert got.index.name == "b" @@ -388,16 +374,12 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index # Instead, directly use PyArrow to optionally omit the index - expect = pa.parquet.read_table( - fname, use_pandas_metadata=pandas_compat - ).to_pandas() + expect = pa.parquet.read_table(fname, use_pandas_metadata=pandas_compat).to_pandas() if as_bytes: # Make sure we can handle RangeIndex parsing # in pandas when the input is `bytes` with open(fname, "rb") as f: - got = cudf.read_parquet( - f.read(), use_pandas_metadata=pandas_compat - ) + got = cudf.read_parquet(f.read(), use_pandas_metadata=pandas_compat) else: got = cudf.read_parquet(fname, use_pandas_metadata=pandas_compat) @@ -436,9 +418,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): null_frequency=0.05, generator=lambda: [ "".join( - random.sample( - string.ascii_letters, random.randint(4, 8) - ) + random.sample(string.ascii_letters, random.randint(4, 8)) ) for _ in range(40) ], @@ -497,9 +477,7 @@ def test_parquet_read_filtered_multiple_files(tmpdir): df = pd.DataFrame({"x": range(10), "y": list("aaccccddee")}) df.to_parquet(fname_1, row_group_size=2) fname_2 = tmpdir.join("filtered_multiple_files_2.parquet") - df = pd.DataFrame( - {"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")} - ) + df = pd.DataFrame({"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")}) df.to_parquet(fname_2, row_group_size=2) # Check filter @@ -531,9 +509,7 @@ def test_parquet_read_filtered_multiple_files(tmpdir): ([[("x", "==", 0), ("z", "==", 9), ("y", "==", "a")]], 1), ], ) -def test_parquet_read_filtered_complex_predicate( - tmpdir, predicate, expected_len -): 
+def test_parquet_read_filtered_complex_predicate(tmpdir, predicate, expected_len): # Generate data fname = tmpdir.join("filtered_complex_predicate.parquet") df = pd.DataFrame( @@ -688,9 +664,7 @@ def test_parquet_reader_local_filepath(): cudf.read_parquet(fname) -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] -) +@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]) def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src): expect = pd.read_parquet(parquet_path_or_buf("filepath")) got = cudf.read_parquet(parquet_path_or_buf(src)) @@ -720,15 +694,11 @@ def test_parquet_reader_use_python_file_object( # Pass open fsspec file with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + got1 = cudf.read_parquet(fil, use_python_file_object=use_python_file_object) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + got2 = cudf.read_parquet(paths[0], use_python_file_object=use_python_file_object) assert_eq(expect, got2) @@ -752,9 +722,7 @@ def create_parquet_source(df, src_type, fname): return pathlib.Path(fname).as_uri() -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] -) +@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]) def test_parquet_reader_multiple_files(tmpdir, src): test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=500, dtype="float64") @@ -884,9 +852,7 @@ def string_gen(first_val, i): return strings[int_gen(first_val, i) % len(strings)] -def list_row_gen( - gen, first_val, list_size, lists_per_row, include_validity=False -): +def list_row_gen(gen, first_val, list_size, lists_per_row, include_validity=False): """ Generate a single row for a List> column based on input parameters. 
@@ -952,10 +918,7 @@ def L(list_size, first_val): ] def R(first_val, lists_per_row, list_size): - return [ - L(list_size, first_val + (list_size * i)) - for i in range(lists_per_row) - ] + return [L(list_size, first_val + (list_size * i)) for i in range(lists_per_row)] return [ ( @@ -987,9 +950,7 @@ def test_parquet_reader_list_large(tmpdir): def test_parquet_reader_list_validity(tmpdir): - expect = pd.DataFrame( - {"a": list_gen(int_gen, 256, 80, 50, include_validity=True)} - ) + expect = pd.DataFrame({"a": list_gen(int_gen, 256, 80, 50, include_validity=True)}) fname = tmpdir.join("test_parquet_reader_list_validity.parquet") expect.to_parquet(fname) assert os.path.exists(fname) @@ -1036,9 +997,7 @@ def test_parquet_reader_list_large_multi_rowgroup(tmpdir): expect.reset_index(inplace=True) # round trip the dataframe to/from parquet - fname = tmpdir.join( - "test_parquet_reader_list_large_multi_rowgroup.parquet" - ) + fname = tmpdir.join("test_parquet_reader_list_large_multi_rowgroup.parquet") expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) got = cudf.read_parquet(fname) @@ -1055,9 +1014,7 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): ) # round trip the dataframe to/from parquet - fname = tmpdir.join( - "test_parquet_reader_list_large_multi_rowgroup_nulls.parquet" - ) + fname = tmpdir.join("test_parquet_reader_list_large_multi_rowgroup_nulls.parquet") expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) assert os.path.exists(fname) got = cudf.read_parquet(fname) @@ -1087,8 +1044,7 @@ def struct_gen(gen, skip_rows, num_rows, include_validity=False): def R(first_val, num_fields): return { - "col" - + str(f): (gen[f](first_val, first_val) if f % 4 != 0 else None) + "col" + str(f): (gen[f](first_val, first_val) if f % 4 != 0 else None) if include_validity else (gen[f](first_val, first_val)) for f in range(len(gen)) @@ -1548,9 +1504,7 @@ def test_parquet_reader_nested_v2(tmpdir, data): @pytest.mark.filterwarnings("ignore:Using CPU") -def test_parquet_writer_cpu_pyarrow( - tmpdir, pdf_day_timestamps, gdf_day_timestamps -): +def test_parquet_writer_cpu_pyarrow(tmpdir, pdf_day_timestamps, gdf_day_timestamps): pdf_fname = tmpdir.join("pdf.parquet") gdf_fname = tmpdir.join("gdf.parquet") @@ -1809,9 +1763,7 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs): # Know the specific row-group count for row_group_size_rows if "row_group_size_rows" in row_group_size_kwargs: - assert ( - nrow_groups == size // row_group_size_kwargs["row_group_size_rows"] - ) + assert nrow_groups == size // row_group_size_kwargs["row_group_size_rows"] assert_eq(cudf.read_parquet(fname), gdf) @@ -1975,9 +1927,7 @@ def test_parquet_writer_chunked_max_file_size( gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1] * 10000, "b": range(0, 50000)}) - df2 = cudf.DataFrame( - {"a": [1, 3, 3, 1, 3] * 10000, "b": range(50000, 100000)} - ) + df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3] * 10000, "b": range(50000, 100000)}) cw = ParquetDatasetWriter( gdf_dir, @@ -2039,9 +1989,7 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) - with ParquetDatasetWriter( - gdf_dir, partition_cols=["a"], index=False - ) as cw: + with ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False) as cw: cw.write_table(df1) cw.write_table(df2) @@ -2109,9 +2057,7 @@ def 
test_parquet_write_to_dataset(tmpdir_factory, cols): ) @pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) @pytest.mark.parametrize("use_cat", [True, False]) -def test_read_parquet_partitioned_filtered( - tmpdir, pfilters, selection, use_cat -): +def test_read_parquet_partitioned_filtered(tmpdir, pfilters, selection, use_cat): rng = np.random.default_rng(2) path = str(tmpdir) size = 100 @@ -2231,12 +2177,8 @@ def test_write_cudf_read_pandas_pyarrow(tmpdir, pdf): assert_eq(pd_res, cudf_res, check_index_type=not pdf.empty) - cudf_res = pa.parquet.read_table( - cudf_path, use_pandas_metadata=True - ).to_pandas() - pd_res = pa.parquet.read_table( - pandas_path, use_pandas_metadata=True - ).to_pandas() + cudf_res = pa.parquet.read_table(cudf_path, use_pandas_metadata=True).to_pandas() + pd_res = pa.parquet.read_table(pandas_path, use_pandas_metadata=True).to_pandas() assert_eq(cudf_res, pd_res, check_index_type=not pdf.empty) @@ -2367,11 +2309,7 @@ def test_parquet_nullable_boolean(tmpdir, engine): pandas_path = tmpdir.join("pandas_bools.parquet") pdf = pd.DataFrame( - { - "a": pd.Series( - [True, False, None, True, False], dtype=pd.BooleanDtype() - ) - } + {"a": pd.Series([True, False, None, True, False], dtype=pd.BooleanDtype())} ) expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]}) @@ -2459,9 +2397,7 @@ def test_parquet_no_index_empty(): def test_parquet_allnull_str(tmpdir, engine): pandas_path = tmpdir.join("pandas_allnulls.parquet") - pdf = pd.DataFrame( - {"a": pd.Series([None, None, None, None, None], dtype="str")} - ) + pdf = pd.DataFrame({"a": pd.Series([None, None, None, None, None], dtype="str")}) expected_gdf = cudf.DataFrame( {"a": cudf.Series([None, None, None, None, None], dtype="str")} ) @@ -2836,9 +2772,7 @@ def test_parquet_reader_one_level_list3(datadir): @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) -def test_to_parquet_row_group_size( - tmpdir, large_int64_gdf, size_bytes, size_rows -): +def test_to_parquet_row_group_size(tmpdir, large_int64_gdf, size_bytes, size_rows): fname = tmpdir.join("row_group_size.parquet") large_int64_gdf.to_parquet( fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 13a07ef8adc..f0834429c1e 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -53,9 +53,7 @@ def test_pickle_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) + df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) df["vals"] = np.random.random(len(df)) check_serialization(df) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index cf9e70d85c7..68424fd8e17 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import datetime @@ -64,9 +64,7 @@ def test_query(data, fn, nulls): ] -@pytest.mark.parametrize( - "data,fn", product(params_query_data, params_query_env_fn) -) +@pytest.mark.parametrize("data,fn", product(params_query_data, params_query_env_fn)) def test_query_ref_env(data, fn): # prepare nelem, seed = data diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 1a5f25e320f..f27b2dfeeb0 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -15,9 +15,7 @@ def pdf(): return pd.DataFrame( { "col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]), - "col2": np.array( - [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] - ), + "col2": np.array([5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]), }, index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]), ) @@ -38,9 +36,7 @@ def test_rank_all_arguments( pdf = pdf.copy(deep=True) # for parallel pytest if numeric_only: - pdf["str"] = np.array( - ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"] - ) + pdf["str"] = np.array(["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"]) gdf = DataFrame.from_pandas(pdf) kwargs = { diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index c6ffa1d2bc7..c51987b48e8 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -83,9 +83,7 @@ def test_product(dtype, nelem): data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = ( - np.random.uniform() * 2 - ) + data[np.random.randint(low=0, high=nelem, size=1)] = np.random.uniform() * 2 else: data = gen_rand(dtype, nelem) @@ -369,9 +367,7 @@ def test_reductions_axis_none_warning(op): def test_reduction_column_multiindex(): - idx = cudf.MultiIndex.from_tuples( - [("a", 1), ("a", 2)], names=["foo", "bar"] - ) + idx = cudf.MultiIndex.from_tuples([("a", 1), ("a", 2)], names=["foo", "bar"]) df = cudf.DataFrame(np.array([[1, 3], [2, 4]]), columns=idx) result = df.mean() expected = df.to_pandas().mean() diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 8992c4d617b..de623f94728 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -362,9 +362,7 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): pdata = container(data) if np.dtype(data_dtype).kind not in ("f"): - data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[ - np.dtype(data_dtype) - ] + data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[np.dtype(data_dtype)] pdata = pdata.astype(data_dtype) # Explicitly using nans_as_nulls=True @@ -391,18 +389,18 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): cudf.Series(["-74.56", None, "-23.73", "34.55", "2.89", None]).astype( Decimal32Dtype(7, 2) ), - cudf.Series( - ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan] - ).astype(Decimal64Dtype(8, 3)), - cudf.Series( - ["2.964", None, "57.432", "-989.330", None, "56.444"] - ).astype(Decimal64Dtype(8, 3)), - cudf.Series( - [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan] - ).astype(Decimal64Dtype(10, 4)), - cudf.Series( - ["2.964", None, "54347.432", "-989.330", None, "56.444"] - ).astype(Decimal128Dtype(20, 7)), + cudf.Series(["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan]).astype( + Decimal64Dtype(8, 3) + ), + cudf.Series(["2.964", None, "57.432", "-989.330", None, 
"56.444"]).astype( + Decimal64Dtype(8, 3) + ), + cudf.Series([np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan]).astype( + Decimal64Dtype(10, 4) + ), + cudf.Series(["2.964", None, "54347.432", "-989.330", None, "56.444"]).astype( + Decimal128Dtype(20, 7) + ), ], ) @pytest.mark.parametrize( @@ -487,10 +485,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): else: fill_value_cudf = fill_value - if ( - isinstance(fill_value_cudf, cudf.Series) - and gsr.dtype != fill_value_cudf.dtype - ): + if isinstance(fill_value_cudf, cudf.Series) and gsr.dtype != fill_value_cudf.dtype: assert_exceptions_equal( lfunc=psr.fillna, rfunc=gsr.fillna, @@ -671,26 +666,16 @@ def test_fillna_datetime(psr_data, fill_value, inplace): dtype="datetime64[ns]", ), # Timedelta - np.array( - [10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]" - ), - np.array( - [None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]" - ), - np.array( - [10, 100, None, None, 1000, None, None], dtype="datetime64[ns]" - ), + np.array([10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]"), + np.array([None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]"), + np.array([10, 100, None, None, 1000, None, None], dtype="datetime64[ns]"), # String np.array( ["10", "100", "1000", None, None, "10", "100", "1000"], dtype="object", ), - np.array( - [None, None, "1000", None, "10", "100", "10"], dtype="object" - ), - np.array( - ["10", "100", None, None, "1000", None, None], dtype="object" - ), + np.array([None, None, "1000", None, "10", "100", "10"], dtype="object"), + np.array(["10", "100", None, None, "1000", None, None], dtype="object"), ], ) @pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) @@ -721,9 +706,7 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace): "df", [ pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}), - pd.DataFrame( - {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"] - ), + pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"]), pd.DataFrame({"a": [1, 2, 3]}), ], ) @@ -978,9 +961,7 @@ def test_series_multiple_times_with_nulls(): @pytest.mark.parametrize("series_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5] -) +@pytest.mark.parametrize("replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5]) def test_numeric_series_replace_dtype(series_dtype, replacement): psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) sr = cudf.from_pandas(psr) @@ -1019,15 +1000,13 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): sr.replace([0, 1], [replacement]) # Both lists of equal length - if ( - np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"} - ) or (not can_replace): + if (np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"}) or ( + not can_replace + ): with pytest.raises(TypeError): sr.replace([2, 3], [replacement, replacement]) else: - expect = psr.replace([2, 3], [replacement, replacement]).astype( - psr.dtype - ) + expect = psr.replace([2, 3], [replacement, replacement]).astype(psr.dtype) got = sr.replace([2, 3], [replacement, replacement]) assert_eq(expect, got) @@ -1133,9 +1112,7 @@ def test_replace_df_error(): ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_clip(lower, upper, inplace): - pdf = pd.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} - ) + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 
8.11]}) gdf = cudf.from_pandas(pdf) got = gdf.clip(lower=lower, upper=upper, inplace=inplace) @@ -1172,9 +1149,7 @@ def test_dataframe_category_clip(lower, upper, inplace): [([2, 7.4], [4, 7.9, "d"]), ([2, 7.4, "a"], [4, 7.9, "d"])], ) def test_dataframe_exceptions_for_clip(lower, upper): - gdf = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} - ) + gdf = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]}) with pytest.raises(ValueError): gdf.clip(lower=lower, upper=upper) @@ -1383,9 +1358,7 @@ def test_replace_nulls(gsr, old, new, expected): def test_fillna_columns_multiindex(): columns = pd.MultiIndex.from_tuples([("a", "b"), ("d", "e")]) - pdf = pd.DataFrame( - {"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]} - ) + pdf = pd.DataFrame({"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]}) pdf.columns = columns gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 8f65bd26bd1..ea703a26a39 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -31,9 +31,7 @@ def test_null_series(nrows, dtype): if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view(mode="read").copy_to_host(), - dtype=np_dtypes_to_pandas_dtypes.get( - cudf.dtype(dtype), cudf.dtype(dtype) - ), + dtype=np_dtypes_to_pandas_dtypes.get(cudf.dtype(dtype), cudf.dtype(dtype)), ) ps[sr.isnull().to_pandas()] = pd.NA else: @@ -119,11 +117,7 @@ def test_integer_dataframe(x): pd.reset_option("display.max_columns") -@given( - st.lists( - st.integers(-9223372036854775808, 9223372036854775807), max_size=10000 - ) -) +@given(st.lists(st.integers(-9223372036854775808, 9223372036854775807), max_size=10000)) @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x, dtype=int) @@ -152,9 +146,7 @@ def test_float_series(x): def mixed_pdf(): pdf = pd.DataFrame() pdf["Integer"] = np.array([2345, 11987, 9027, 9027]) - pdf["Date"] = np.array( - ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] - ) + pdf["Date"] = np.array(["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"]) pdf["Float"] = np.array([9.001, 8.343, 6, 2.781]) pdf["Integer2"] = np.array([2345, 106, 2088, 789277]) pdf["Category"] = np.array(["M", "F", "F", "F"]) @@ -207,9 +199,7 @@ def test_MI(): @pytest.mark.parametrize("nrows", [0, 1, 3, 5, 10]) @pytest.mark.parametrize("ncols", [0, 1, 2, 3]) def test_groupby_MI(nrows, ncols): - gdf = cudf.DataFrame( - {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} - ) + gdf = cudf.DataFrame({"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}) pdf = gdf.to_pandas() gdg = gdf.groupby(["a", "b"], sort=True).count() pdg = pdf.groupby(["a", "b"], sort=True).count() @@ -332,11 +322,8 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ",\n , ],\n dtype='uint32')", ), ( - cudf.Index( - [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" - ), - "Index([, 111, 22, 33, , 23, 34, 2343, ], " - "dtype='int16')", + cudf.Index([None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16"), + "Index([, 111, 22, 33, , 23, 34, 2343, ], " "dtype='int16')", ), ( cudf.Index([1, 2, 3, None], dtype="category"), @@ -655,9 +642,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( - [None, None, None, None, None], dtype="timedelta64[ns]" - ), + cudf.Series([None, None, None, None, None], dtype="timedelta64[ns]"), textwrap.dedent( """ 0 NaT @@ -670,9 +655,7 
@@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( - [None, None, None, None, None], dtype="timedelta64[ms]" - ), + cudf.Series([None, None, None, None, None], dtype="timedelta64[ms]"), textwrap.dedent( """ 0 NaT @@ -685,9 +668,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( - [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" - ), + cudf.Series([12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]"), textwrap.dedent( """ 0 0 days 00:00:00.000000012 @@ -701,9 +682,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( - [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" - ), + cudf.Series([12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]"), textwrap.dedent( """ 0 0 days 00:00:00.012 @@ -873,11 +852,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): [ ( cudf.DataFrame( - { - "a": cudf.Series( - [1000000, 200000, 3000000], dtype="timedelta64[s]" - ) - } + {"a": cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[s]")} ), textwrap.dedent( """ @@ -1034,11 +1009,8 @@ def test_timedelta_dataframe_repr(df, expected_repr): "dtype='timedelta64[ms]')", ), ( - cudf.Index( - [None, None, None, None, None], dtype="timedelta64[us]" - ), - "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " - "dtype='timedelta64[us]')", + cudf.Index([None, None, None, None, None], dtype="timedelta64[us]"), + "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " "dtype='timedelta64[us]')", ), ( cudf.Index( @@ -1088,9 +1060,7 @@ def test_timedelta_index_repr(index, expected_repr): @pytest.mark.parametrize( "pmi", [ - pd.MultiIndex.from_tuples( - [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] - ), + pd.MultiIndex.from_tuples([(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]), pd.MultiIndex.from_tuples( [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] * 10 ), @@ -1242,9 +1212,7 @@ def test_multiindex_repr(pmi, max_seq_items): cudf.DataFrame( { "a": [None, None, None, None], - "b": cudf.Series( - [None, None, None, None], dtype="timedelta64[ns]" - ), + "b": cudf.Series([None, None, None, None], dtype="timedelta64[ns]"), "c": [0.345, np.nan, 100, 10], } ) @@ -1359,9 +1327,9 @@ def test_multiindex_null_repr(gdi, expected_repr): def test_categorical_series_with_nan_repr(): - series = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") + series = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( + "category" + ) expected_repr = textwrap.dedent( """ @@ -1393,9 +1361,9 @@ def test_categorical_series_with_nan_repr(): def test_categorical_dataframe_with_nan_repr(): - series = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") + series = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( + "category" + ) df = cudf.DataFrame({"a": series}) expected_repr = textwrap.dedent( """ @@ -1414,9 +1382,9 @@ def test_categorical_dataframe_with_nan_repr(): def test_categorical_index_with_nan_repr(): cat_index = cudf.Index( - cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") + cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( + "category" + ) ) expected_repr = ( diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index ad6e0ac52c5..7564d865142 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -52,9 +52,7 @@ def test_series_resample_ffill(rule): rng 
= pd.date_range("1/1/2012", periods=10, freq="5s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) gts = cudf.from_pandas(ts) - assert_resample_results_equal( - ts.resample(rule).ffill(), gts.resample(rule).ffill() - ) + assert_resample_results_equal(ts.resample(rule).ffill(), gts.resample(rule).ffill()) @pytest.mark.parametrize("rule", ["2s", "10s"]) @@ -62,9 +60,7 @@ def test_series_resample_bfill(rule): rng = pd.date_range("1/1/2012", periods=10, freq="5s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) gts = cudf.from_pandas(ts) - assert_resample_results_equal( - ts.resample(rule).bfill(), gts.resample(rule).bfill() - ) + assert_resample_results_equal(ts.resample(rule).bfill(), gts.resample(rule).bfill()) @pytest.mark.parametrize("rule", ["2s", "10s"]) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index d618669755d..a1da690d0ad 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -46,9 +46,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): colname = "id" + str(i) data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -60,9 +58,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): colname = "val" + str(i) data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -77,9 +73,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars) # pandas' melt makes the 'variable' column of 'object' type (string) # cuDF's melt makes it Categorical because it doesn't support strings - expect["variable"] = expect["variable"].astype( - got["variable"].dtype.to_pandas() - ) + expect["variable"] = expect["variable"].astype(got["variable"].dtype.to_pandas()) assert_eq(expect, got) @@ -95,9 +89,7 @@ def test_melt_many_columns(): grid_df = pd.melt(df, id_vars=["id"], var_name="d", value_name="sales") df_d = cudf.DataFrame(mydict) - grid_df_d = cudf.melt( - df_d, id_vars=["id"], var_name="d", value_name="sales" - ) + grid_df_d = cudf.melt(df_d, id_vars=["id"], var_name="d", value_name="sales") grid_df_d["d"] = grid_df_d["d"].astype("str") assert_eq(grid_df, grid_df_d) @@ -105,9 +97,7 @@ def test_melt_many_columns(): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) -@pytest.mark.parametrize( - "dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"])) -) +@pytest.mark.parametrize("dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"]))) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_df_stack(nulls, num_cols, num_rows, dtype): if dtype not in ["float32", "float64"] and nulls in ["some"]: @@ -118,9 +108,7 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): colname = str(i) data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ 
-202,13 +190,9 @@ def test_df_stack_reset_index(): @pytest.mark.parametrize("dropna", [True, False]) def test_df_stack_multiindex_column_axis(columns, index, level, dropna): if isinstance(level, list) and len(level) > 1 and not dropna: - pytest.skip( - "Stacking multiple levels with dropna==False is unsupported." - ) + pytest.skip("Stacking multiple levels with dropna==False is unsupported.") - pdf = pd.DataFrame( - data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index - ) + pdf = pd.DataFrame(data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index) gdf = cudf.from_pandas(pdf) with pytest.warns(FutureWarning): @@ -274,9 +258,7 @@ def test_df_stack_multiindex_column_axis_pd_example(level): @pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) @pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["category"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["category"]) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_interleave_columns(nulls, num_cols, num_rows, dtype): if dtype not in ["float32", "float64"] and nulls in ["some"]: @@ -288,9 +270,7 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -302,9 +282,7 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): else: got = gdf.interleave_columns() - expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype( - dtype - ) + expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype(dtype) assert_eq(expect, got) @@ -321,14 +299,10 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): pdf = pd.DataFrame(dtype=dtype) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( - dtype - ) + data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -364,9 +338,7 @@ def _prepare_merge_sorted_test( indices = [i * chunk for i in range(0, nparts)] + [size] if index: dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_index(ascending=ascending) + df.iloc[indices[i] : indices[i + 1]].copy().sort_index(ascending=ascending) for i in range(nparts) ] elif series: @@ -406,9 +378,7 @@ def test_df_merge_sorted(nparts, keys, na_position, ascending): ascending=ascending, ) - expect = df.sort_values( - keys_1, na_position=na_position, ascending=ascending - ) + expect = df.sort_values(keys_1, na_position=na_position, ascending=ascending) result = cudf.core.reshape._merge_sorted( dfs, keys=keys, na_position=na_position, ascending=ascending ) @@ -430,9 +400,7 @@ def test_df_merge_sorted_index(nparts, index, ascending): ) expect = df.sort_index(ascending=ascending) - result = cudf.core.reshape._merge_sorted( - dfs, by_index=True, ascending=ascending - ) + result = cudf.core.reshape._merge_sorted(dfs, by_index=True, ascending=ascending) assert_eq(expect.index, result.index) @@ -456,9 +424,7 @@ def test_df_merge_sorted_ignore_index(keys, na_position, ascending): ascending=ascending, ) - expect = df.sort_values( - keys_1, 
na_position=na_position, ascending=ascending - ) + expect = df.sort_values(keys_1, na_position=na_position, ascending=ascending) result = cudf.core.reshape._merge_sorted( dfs, keys=keys, @@ -548,9 +514,7 @@ def test_pivot_multi_values(): ) -@pytest.mark.parametrize( - "values", ["z", "z123", ["z123"], ["z", "z123", "123z"]] -) +@pytest.mark.parametrize("values", ["z", "z123", ["z123"], ["z", "z123", "123z"]]) def test_pivot_values(values): data = [ ["A", "a", 0, 0, 0], @@ -581,32 +545,24 @@ def test_pivot_values(values): 0, pytest.param( 1, - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), + marks=pytest_xfail(reason="Categorical column indexes not supported"), ), 2, "foo", pytest.param( "bar", - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), + marks=pytest_xfail(reason="Categorical column indexes not supported"), ), "baz", [], pytest.param( [0, 1], - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), + marks=pytest_xfail(reason="Categorical column indexes not supported"), ), ["foo"], pytest.param( ["foo", "bar"], - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), + marks=pytest_xfail(reason="Categorical column indexes not supported"), ), pytest.param( [0, 1, 2], @@ -646,9 +602,7 @@ def test_unstack_multiindex(level): pd.Index(range(0, 5), name="row_index"), pytest.param( pd.CategoricalIndex(["d", "e", "f", "g", "h"]), - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), + marks=pytest_xfail(reason="Categorical column indexes not supported"), ), ], ) @@ -689,9 +643,7 @@ def test_unstack_index_invalid(): def test_pivot_duplicate_error(): - gdf = cudf.DataFrame( - {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} - ) + gdf = cudf.DataFrame({"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]}) with pytest.raises(ValueError): gdf.pivot(index="a", columns="b") with pytest.raises(ValueError): @@ -710,9 +662,7 @@ def test_pivot_duplicate_error(): } ], ) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) +@pytest.mark.parametrize("aggfunc", ["mean", "count", {"D": "sum", "E": "count"}]) @pytest.mark.parametrize("fill_value", [0]) def test_pivot_table_simple(data, aggfunc, fill_value): pdf = pd.DataFrame(data) @@ -748,9 +698,7 @@ def test_pivot_table_simple(data, aggfunc, fill_value): } ], ) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) +@pytest.mark.parametrize("aggfunc", ["mean", "count", {"D": "sum", "E": "count"}]) @pytest.mark.parametrize("fill_value", [0]) def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): pdf = pd.DataFrame(data) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 1d1d7ae8d29..1280aed6c42 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -20,9 +20,7 @@ ([1, 2, 4, 9, 9, 4], ["a", "b", "c", "d", "e", "f"]), ], ) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "std", "var"] -) +@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "std", "var"]) @pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) @pytest.mark.parametrize("center", [True, False]) def test_rolling_series_basic(data, index, agg, nulls, center): @@ -46,9 +44,9 @@ def test_rolling_series_basic(data, index, agg, nulls, center): expect = getattr( psr.rolling(window_size, min_periods, center), agg )().fillna(-1) - got = 
getattr( - gsr.rolling(window_size, min_periods, center), agg - )().fillna(-1) + got = getattr(gsr.rolling(window_size, min_periods, center), agg)().fillna( + -1 + ) assert_eq(expect, got, check_dtype=False, check_freq=False) @@ -64,9 +62,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): }, ], ) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "std", "var"] -) +@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "std", "var"]) @pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) @pytest.mark.parametrize("center", [True, False]) def test_rolling_dataframe_basic(data, agg, nulls, center): @@ -92,9 +88,9 @@ def test_rolling_dataframe_basic(data, agg, nulls, center): expect = getattr( pdf.rolling(window_size, min_periods, center), agg )().fillna(-1) - got = getattr( - gdf.rolling(window_size, min_periods, center), agg - )().fillna(-1) + got = getattr(gdf.rolling(window_size, min_periods, center), agg)().fillna( + -1 + ) assert_eq(expect, got, check_dtype=False) @@ -278,9 +274,7 @@ def test_rolling_getitem(): def test_rolling_getitem_window(): - index = pd.DatetimeIndex( - pd.date_range("2000-01-01", "2000-01-02", freq="1h") - ) + index = pd.DatetimeIndex(pd.date_range("2000-01-01", "2000-01-02", freq="1h")) pdf = pd.DataFrame({"x": np.arange(len(index))}, index=index) gdf = cudf.from_pandas(pdf) @@ -381,9 +375,7 @@ def some_func(A): ) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) +@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "var", "std"]) def test_rolling_groupby_simple(agg): pdf = pd.DataFrame( { @@ -394,9 +386,7 @@ def test_rolling_groupby_simple(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) + expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna(-1) got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -406,16 +396,12 @@ def test_rolling_groupby_simple(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) + expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna(-1) got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) +@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "var", "std"]) def test_rolling_groupby_multi(agg): pdf = pd.DataFrame( { @@ -436,12 +422,8 @@ def test_rolling_groupby_multi(agg): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -@pytest.mark.parametrize( - "window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"] -) +@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "var", "std"]) +@pytest.mark.parametrize("window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"]) def test_rolling_groupby_offset(agg, window_size): pdf = pd.DataFrame( { @@ -451,9 +433,7 @@ def test_rolling_groupby_offset(agg, window_size): } ).set_index("date") gdf = cudf.from_pandas(pdf) - expect = getattr(pdf.groupby("group").rolling(window_size), agg)().fillna( - -1 - ) + expect = getattr(pdf.groupby("group").rolling(window_size), agg)().fillna(-1) got = 
getattr(gdf.groupby("group").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -462,9 +442,7 @@ def test_rolling_custom_index_support(): from pandas.api.indexers import BaseIndexer class CustomIndexer(BaseIndexer): - def get_window_bounds( - self, num_values, min_periods, center, closed, step=None - ): + def get_window_bounds(self, num_values, min_periods, center, closed, step=None): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index cdce17eeb76..f36afc950e8 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -124,9 +124,7 @@ def pdf_ext(scope="module"): df["Integer"] = np.array([i for i in range(size)]) df["List"] = [[i] for i in range(size)] df["Struct"] = [{"a": i} for i in range(size)] - df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ - :size - ] + df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[:size] return df @@ -188,9 +186,7 @@ def test_read_csv_byte_range( f"s3://{bucket}/{fname}", storage_options=s3so, byte_range=(74, 73), - bytes_per_thread=bytes_per_thread - if not use_python_file_object - else None, + bytes_per_thread=bytes_per_thread if not use_python_file_object else None, header=None, names=["Integer", "Float", "Integer2", "String", "Boolean"], use_python_file_object=use_python_file_object, @@ -259,9 +255,7 @@ def test_read_parquet( # Check fsspec file-object handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = get_fs_token_paths( - f"s3://{bucket}/{fname}", storage_options=s3so - )[0] + fs = get_fs_token_paths(f"s3://{bucket}/{fname}", storage_options=s3so)[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: got2 = cudf.read_parquet( f, @@ -303,9 +297,7 @@ def test_read_parquet_ext( ) if index: expect = ( - pdf_ext.set_index(index)[columns] - if columns - else pdf_ext.set_index(index) + pdf_ext.set_index(index)[columns] if columns else pdf_ext.set_index(index) ) else: expect = pdf_ext[columns] if columns else pdf_ext @@ -404,12 +396,8 @@ def test_write_parquet(s3_base, s3so, pdf, partition_cols): ) assert s3fs.exists(f"s3://{bucket}/{fname_pandas}") - got = pd.read_parquet( - f"s3://{bucket}/{fname_pandas}", storage_options=s3so - ) - expect = cudf.read_parquet( - f"s3://{bucket}/{fname_cudf}", storage_options=s3so - ) + got = pd.read_parquet(f"s3://{bucket}/{fname_pandas}", storage_options=s3so) + expect = cudf.read_parquet(f"s3://{bucket}/{fname_cudf}", storage_options=s3so) assert_eq(expect, got) @@ -504,9 +492,7 @@ def test_write_chunked_parquet(s3_base, s3so): bucket = "parquet" from cudf.io.parquet import ParquetDatasetWriter - with s3_context( - s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()} - ) as s3fs: + with s3_context(s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()}) as s3fs: with ParquetDatasetWriter( f"s3://{bucket}/{dirname}", partition_cols=["a"], diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 05a91a8fea3..9ac0404a9bc 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -203,17 +203,13 @@ def test_scalar_roundtrip(value): @pytest.mark.parametrize( "dtype", - NUMERIC_TYPES - + DATETIME_TYPES - + TIMEDELTA_TYPES - + ["object"] - + TEST_DECIMAL_TYPES, + NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] + TEST_DECIMAL_TYPES, ) def 
test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype( + if cudf.api.types.is_datetime64_dtype(dtype) or cudf.api.types.is_timedelta64_dtype( dtype - ) or cudf.api.types.is_timedelta64_dtype(dtype): + ): assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -245,9 +241,7 @@ def test_nat_to_null_scalar_succeeds(value): assert s.dtype == value.dtype -@pytest.mark.parametrize( - "value", [None, np.datetime64("NaT"), np.timedelta64("NaT")] -) +@pytest.mark.parametrize("value", [None, np.datetime64("NaT"), np.timedelta64("NaT")]) def test_generic_null_scalar_construction_fails(value): with pytest.raises(TypeError): cudf.Scalar(value) @@ -395,9 +389,7 @@ def test_device_scalar_direct_construction(value, decimal_type): @pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_construct_from_scalar(value): value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) - x = cudf.Scalar( - value, value.dtype if not isinstance(value, Decimal) else None - ) + x = cudf.Scalar(value, value.dtype if not isinstance(value, Decimal) else None) y = cudf.Scalar(x) assert x.value == y.value or np.isnan(x.value) and np.isnan(y.value) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 3ba652ff6c0..1b4ebec781a 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -105,9 +105,7 @@ def test_searchsorted_categorical(side): @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_datetime(side): - psr1 = pd.Series( - pd.date_range("20190101", "20200101", freq="400h", name="times") - ) + psr1 = pd.Series(pd.date_range("20190101", "20200101", freq="400h", name="times")) sr1 = cudf.from_pandas(psr1) psr2 = pd.Series( diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index f26d78e7783..419d4dda7a9 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -83,17 +83,13 @@ lambda: cudf.DataFrame( {"a": list(range(13)), "b": [float(x) for x in range(13)]}, index=cudf.Index( - cudf.date_range( - start="2011-01-01", end="2012-01-01", periods=13 - ) + cudf.date_range(start="2011-01-01", end="2012-01-01", periods=13) ), ), lambda: cudf.Series( list(range(13)), index=cudf.Index( - cudf.date_range( - start="2011-01-01", end="2012-01-01", periods=13 - ) + cudf.date_range(start="2011-01-01", end="2012-01-01", periods=13) ), ), lambda: cudf.TimedeltaIndex( @@ -163,9 +159,7 @@ def test_serialize_dataframe(): df = cudf.DataFrame() df["a"] = np.arange(100) df["b"] = np.arange(100, dtype=np.float32) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] - ) + df["c"] = pd.Categorical(["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"]) outdf = cudf.DataFrame.deserialize(*df.serialize()) assert_eq(df, outdf) @@ -174,9 +168,7 @@ def test_serialize_dataframe_with_index(): df = cudf.DataFrame() df["a"] = np.arange(100) df["b"] = np.random.random(100) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] - ) + df["c"] = pd.Categorical(["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"]) df = df.sort_values("b") outdf = cudf.DataFrame.deserialize(*df.serialize()) assert_eq(df, outdf) @@ -211,9 +203,7 @@ def test_serialize_multi_index(): gdf = cudf.DataFrame.from_pandas(pdf) gdg = gdf.groupby(["a", "b"]).sum() multiindex = gdg.index - outindex = cudf.core.multiindex.MultiIndex.deserialize( - 
*multiindex.serialize() - ) + outindex = cudf.core.multiindex.MultiIndex.deserialize(*multiindex.serialize()) assert_eq(multiindex, outindex) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 48194494260..5b2308465c4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -40,9 +40,7 @@ def _series_na_data(): pd.Series(["a", "b", "u", "h", "d"]), pd.Series([None, None, np.nan, None, np.inf, -np.inf]), pd.Series([], dtype="float64"), - pd.Series( - [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] - ), + pd.Series([pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")]), pd.Series([np.nan]), pd.Series([None]), pd.Series(["a", "b", "", "c", None, "e"]), @@ -260,9 +258,7 @@ def test_series_concat_error_mixed_types(): ] * 25, [ - pd.Series( - [0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"] - ), + pd.Series([0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"]), pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), ] * 46, @@ -309,13 +305,9 @@ def test_series_concat_existing_buffers(): a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) gs = cudf.concat([a5, a6]) - np.testing.assert_equal( - gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) - ) + np.testing.assert_equal(gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()])) gs = cudf.concat([cudf.Series(a6), a5]) - np.testing.assert_equal( - gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) - ) + np.testing.assert_equal(gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()])) def test_series_column_iter_error(): @@ -364,9 +356,7 @@ def test_series_column_iter_error(): [None, None, None, None, None], np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), np.array(["1991-11-20", None], dtype=np.datetime64), - np.array( - ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 - ), + np.array(["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64), np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), ], ) @@ -444,9 +434,7 @@ def test_series_describe_timedelta(dtype): pd.Series(["d", "e", "f"], dtype="category"), pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])), pd.Series( - pd.Categorical( - ["d", "e", "f"], categories=["f", "e", "d"], ordered=True - ) + pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"], ordered=True) ), ], ) @@ -557,16 +545,12 @@ def test_categorical_value_counts(dropna, normalize, num_elements): # gdf gdf = cudf.DataFrame() gdf["a"] = cudf.Series.from_categorical(pd_cat) - gdf_value_counts = gdf["a"].value_counts( - dropna=dropna, normalize=normalize - ) + gdf_value_counts = gdf["a"].value_counts(dropna=dropna, normalize=normalize) # pandas pdf = pd.DataFrame() pdf["a"] = pd_cat - pdf_value_counts = pdf["a"].value_counts( - dropna=dropna, normalize=normalize - ) + pdf_value_counts = pdf["a"].value_counts(dropna=dropna, normalize=normalize) # verify assert_eq( @@ -589,15 +573,11 @@ def test_series_value_counts(dropna, normalize): for size in [10**x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array( - arr, cudf.Series(mask)._column.as_mask() - ) + sr = cudf.Series.from_masked_array(arr, cudf.Series(mask)._column.as_mask()) sr.name = "col" expect = ( - sr.to_pandas() - .value_counts(dropna=dropna, normalize=normalize) - .sort_index() + sr.to_pandas().value_counts(dropna=dropna, 
normalize=normalize).sort_index() ) got = sr.value_counts(dropna=dropna, normalize=normalize).sort_index() @@ -634,12 +614,8 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, None]) gsr = cudf.from_pandas(psr) - expected = psr.value_counts( - ascending=ascending, dropna=dropna, normalize=normalize - ) - got = gsr.value_counts( - ascending=ascending, dropna=dropna, normalize=normalize - ) + expected = psr.value_counts(ascending=ascending, dropna=dropna, normalize=normalize) + got = gsr.value_counts(ascending=ascending, dropna=dropna, normalize=normalize) assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) assert_eq( @@ -656,9 +632,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): cudf.Series([None]), cudf.Series([4]), cudf.Series([2, 3, -1, 0, 1], name="test name"), - cudf.Series( - [1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"] - ), + cudf.Series([1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"]), cudf.Series([1, 2, 3, None, 2, 1, None], name="abc"), cudf.Series(["ab", "bc", "ab", None, "bc", None, None]), cudf.Series([None, None, None, None, None], dtype="str"), @@ -860,9 +834,7 @@ def test_series_memory_usage(): ), ( cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"), - pd.Series( - [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() - ), + pd.Series([234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()), ), ( cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"), @@ -874,14 +846,10 @@ def test_series_memory_usage(): ), ( cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"), - pd.Series( - [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype() - ), + pd.Series([11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()), ), ( - cudf.Series( - [32431, None, None, 32322, 0, 10, -32324, None], dtype="int64" - ), + cudf.Series([32431, None, None, 32322, 0, 10, -32324, None], dtype="int64"), pd.Series( [32431, None, None, 32322, 0, 10, -32324, None], dtype=pd.Int64Dtype(), @@ -1252,9 +1220,7 @@ def test_series_drop_raises(): [ None, ["ia", "ib", "ic", "id", "ie"], - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] - ), + pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]), ], ) def test_explode(data, ignore_index, p_index): @@ -1326,9 +1292,7 @@ def test_series_raises_float16(data): @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_series_sort_index( - index, axis, ascending, inplace, ignore_index, na_position -): +def test_series_sort_index(index, axis, ascending, inplace, ignore_index, na_position): ps = pd.Series([10, 3, 12], index=index) gs = cudf.from_pandas(ps) @@ -1412,9 +1376,7 @@ def test_equals_names(lhs, rhs): assert_eq(expect, got) -@pytest.mark.parametrize( - "data", [[True, False, None, True, False], [None, None], []] -) +@pytest.mark.parametrize("data", [[True, False, None, True, False], [None, None], []]) @pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) def test_nullable_bool_dtype_series(data, bool_dtype): psr = pd.Series(data, dtype=pd.BooleanDtype()) @@ -1437,8 +1399,7 @@ def test_reset_index(level, drop, inplace, original_name, name): if not drop and inplace: pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" + "For exception 
checks, see " "test_reset_index_dup_level_name_exceptions" ) expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) @@ -1463,8 +1424,7 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): gs = cudf.from_pandas(ps) if level == [None] or not drop and inplace: pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" + "For exception checks, see " "test_reset_index_dup_level_name_exceptions" ) expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) @@ -1489,8 +1449,7 @@ def test_reset_index_named(drop, inplace, original_name, name): if not drop and inplace: pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" + "For exception checks, see " "test_reset_index_dup_level_name_exceptions" ) expect = ps.reset_index(drop=drop, inplace=inplace, name=name) @@ -1690,19 +1649,13 @@ def test_series_truncate_errors(): def test_series_truncate_datetimeindex(): - dates = cudf.date_range( - "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" - ) + dates = cudf.date_range("2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s") csr = cudf.Series(range(len(dates)), index=dates) psr = csr.to_pandas() assert_eq( - csr.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ), - psr.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ), + csr.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27"), + psr.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27"), ) @@ -1937,18 +1890,14 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): indices = s.digitize(s_bins, right) else: indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) + np.testing.assert_array_equal(np.digitize(data, bins, right), indices.to_numpy()) def test_series_digitize_invalid_bins(): s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." 
- ): + with pytest.raises(ValueError, match="`bins` cannot contain null entries."): _ = s.digitize(bins) @@ -2137,9 +2086,7 @@ def test_series_copy(data, copy): {"a": 1}, ], ) -@pytest.mark.parametrize( - "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] -) +@pytest.mark.parametrize("index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]) def test_series_init_dict_with_index(data, index): pandas_series = pd.Series(data, index=index) cudf_series = cudf.Series(data, index=index) @@ -2148,9 +2095,7 @@ def test_series_init_dict_with_index(data, index): @pytest.mark.parametrize("data", ["abc", None, 1, 3.7]) -@pytest.mark.parametrize( - "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] -) +@pytest.mark.parametrize("index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]) def test_series_init_scalar_with_index(data, index): pandas_series = pd.Series(data, index=index) cudf_series = cudf.Series(data, index=index) @@ -2322,9 +2267,7 @@ def test_series_count_invalid_param(): s.count(skipna=True) -@pytest.mark.parametrize( - "data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]] -) +@pytest.mark.parametrize("data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]]) def test_series_setitem_nat_with_non_datetimes(data): s = cudf.Series(data) with pytest.raises(TypeError): @@ -2407,9 +2350,7 @@ def test_series_arrow_numeric_types_roundtrip(pandas_type): cudf.from_pandas(pdf) -@pytest.mark.parametrize( - "pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()] -) +@pytest.mark.parametrize("pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()]) def test_series_arrow_bool_types_roundtrip(pandas_type): ps = pd.Series([True, False, None], dtype=pandas_type) pi = pd.Index(ps) @@ -2428,9 +2369,7 @@ def test_series_arrow_bool_types_roundtrip(pandas_type): cudf.from_pandas(pdf) -@pytest.mark.parametrize( - "pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()] -) +@pytest.mark.parametrize("pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()]) def test_series_arrow_string_types_roundtrip(pandas_type): ps = pd.Series(["abc", None, "xyz"], dtype=pandas_type) pi = pd.Index(ps) @@ -2683,9 +2622,7 @@ def test_list_interval_like_maintains_dtype(): assert_eq(result, expected) -@pytest.mark.parametrize( - "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] -) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index, pd.Series, pd.Index]) def test_series_from_named_object_name_priority(klass): result = cudf.Series(klass([1], name="a"), name="b") assert result.name == "b" diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 9da08e483c9..c8b5f8ca5c0 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from itertools import product from math import floor @@ -54,9 +54,7 @@ def test_series_map_callable_numeric_random(nelem): # Call map got = sr.map(lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))) - expect = pdsr.map( - lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)) - ) + expect = pdsr.map(lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))) # Check assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index ff2f7bd41f2..61e5b8f4ba3 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -38,9 +38,7 @@ def test_dataframe_setitem_scaler_bool(): [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], ) @pytest.mark.parametrize("arg", [["a"], "a", "b"]) -@pytest.mark.parametrize( - "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] -) +@pytest.mark.parametrize("value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"]) def test_dataframe_setitem_columns(df, arg, value): gdf = cudf.from_pandas(df) cudf_replace_value = value @@ -83,22 +81,16 @@ def test_dataframe_setitem_new_columns(df, arg, value): # set_item_series inconsistency def test_series_setitem_index(): - df = pd.DataFrame( - data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] - ) + df = pd.DataFrame(data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3]) df["b"] = pd.Series(data=[12, 11, 10], index=[3, 2, 1]) - gdf = cudf.DataFrame( - data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] - ) + gdf = cudf.DataFrame(data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3]) gdf["b"] = cudf.Series(data=[12, 11, 10], index=[3, 2, 1]) assert_eq(df, gdf, check_dtype=False) @pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])]) -@pytest.mark.parametrize( - "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] -) +@pytest.mark.parametrize("arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]]) def test_series_set_item(psr, arg): gsr = cudf.from_pandas(psr) @@ -122,9 +114,7 @@ def test_series_setitem_singleton_range(): @pytest.mark.parametrize( "index", [ - pd.MultiIndex.from_frame( - pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]}) - ), + pd.MultiIndex.from_frame(pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]})), ["a", "b", "c"], ], ) @@ -390,9 +380,7 @@ def test_loc_setitem_string_11298(value): @pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944") def test_loc_setitem_list_11944(): - df = pd.DataFrame( - data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]} - ) + df = pd.DataFrame(data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]}) cdf = cudf.from_pandas(df) df.loc[df.a == "yes", "b"] = [["hello"]] cdf.loc[df.a == "yes", "b"] = [["hello"]] @@ -457,16 +445,10 @@ def test_loc_setitem_series_index_alignment_13031(other_index): pd.Series([1, 2, 3], index=pd.RangeIndex(0, 3)), pd.Series([1, 2, 3], index=pd.RangeIndex(start=2, stop=-1, step=-1)), pd.Series([1, 2, 3], index=pd.RangeIndex(start=1, stop=6, step=2)), - pd.Series( - [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2) - ), - pd.Series( - [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3) - ), + pd.Series([1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2)), + pd.Series([1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3)), pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=14, step=4)), - pd.Series( - [1, 2, 3, 4], index=pd.RangeIndex(start=1, 
stop=-14, step=-4) - ), + pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=-14, step=-4)), ], ) @pytest.mark.parametrize("arg", list(range(-20, 20)) + [5.6, 3.1]) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 618c4f30bd9..bb43be9bbdf 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -30,9 +30,7 @@ sort_slice_args = [slice(1, None), slice(None, -1), slice(1, -1)] -@pytest.mark.parametrize( - "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) -) +@pytest.mark.parametrize("nelem,dtype", list(product(sort_nelem_args, sort_dtype_args))) def test_dataframe_sort_values(nelem, dtype): np.random.seed(0) df = DataFrame() @@ -58,9 +56,7 @@ def test_dataframe_sort_values_ignore_index(index, ignore_index): reason="Unstable sorting by pandas(numpy): https://github.com/pandas-dev/pandas/issues/57531" ) - gdf = DataFrame( - {"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]} - ) + gdf = DataFrame({"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]}) gdf = gdf.set_index(index) pdf = gdf.to_pandas() @@ -81,9 +77,7 @@ def test_series_sort_values_ignore_index(ignore_index): assert_eq(expect, got) -@pytest.mark.parametrize( - "nelem,sliceobj", list(product([10, 100], sort_slice_args)) -) +@pytest.mark.parametrize("nelem,sliceobj", list(product([10, 100], sort_slice_args))) def test_dataframe_sort_values_sliced(nelem, sliceobj): np.random.seed(0) df = pd.DataFrame() @@ -111,9 +105,7 @@ def test_series_argsort(nelem, dtype, asc): np.testing.assert_array_equal(expected, res.to_numpy()) -@pytest.mark.parametrize( - "nelem,asc", list(product(sort_nelem_args, [True, False])) -) +@pytest.mark.parametrize("nelem,asc", list(product(sort_nelem_args, [True, False]))) def test_series_sort_index(nelem, asc): np.random.seed(0) sr = Series(100 * np.random.random(nelem)) @@ -212,9 +204,7 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column( - num_cols, num_rows, dtype, ascending, na_position -): +def test_dataframe_multi_column(num_cols, num_rows, dtype, ascending, na_position): np.random.seed(0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() @@ -229,9 +219,7 @@ def test_dataframe_multi_column( got = gdf.sort_values(by, ascending=ascending, na_position=na_position) expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - assert_eq( - got[by].reset_index(drop=True), expect[by].reset_index(drop=True) - ) + assert_eq(got[by].reset_index(drop=True), expect[by].reset_index(drop=True)) @pytest.mark.parametrize("num_cols", [1, 2, 3]) @@ -253,9 +241,7 @@ def test_dataframe_multi_column_nulls( if nulls == "some": idx = np.array([], dtype="int64") if num_rows > 0: - idx = np.random.choice( - num_rows, size=int(num_rows / 4), replace=False - ) + idx = np.random.choice(num_rows, size=int(num_rows / 4), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -266,21 +252,13 @@ def test_dataframe_multi_column_nulls( got = gdf.sort_values(by, ascending=ascending, na_position=na_position) expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - assert_eq( - got[by].reset_index(drop=True), expect[by].reset_index(drop=True) - ) + assert_eq(got[by].reset_index(drop=True), 
expect[by].reset_index(drop=True)) -@pytest.mark.parametrize( - "ascending", list(product((True, False), (True, False))) -) +@pytest.mark.parametrize("ascending", list(product((True, False), (True, False)))) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column_nulls_multiple_ascending( - ascending, na_position -): - pdf = pd.DataFrame( - {"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]} - ) +def test_dataframe_multi_column_nulls_multiple_ascending(ascending, na_position): + pdf = pd.DataFrame({"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]}) gdf = DataFrame.from_pandas(pdf) expect = pdf.sort_values( by=["a", "b"], ascending=ascending, na_position=na_position @@ -338,12 +316,8 @@ def _check_scatter_by_map(dfs, col): _check_scatter_by_map( df.scatter_by_map("a", map_size, keep_index=keep), df["a"] ) - _check_scatter_by_map( - df.scatter_by_map("b", map_size, keep_index=keep), df["b"] - ) - _check_scatter_by_map( - df.scatter_by_map("c", map_size, keep_index=keep), df["c"] - ) + _check_scatter_by_map(df.scatter_by_map("b", map_size, keep_index=keep), df["b"]) + _check_scatter_by_map(df.scatter_by_map("c", map_size, keep_index=keep), df["c"]) with pytest.warns(UserWarning): _check_scatter_by_map( df.scatter_by_map("d", map_size, keep_index=keep), df["d"] @@ -373,12 +347,8 @@ def _check_scatter_by_map(dfs, col): isinstance(frame.index, type(df2.index)) -@pytest.mark.parametrize( - "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) -) -@pytest.mark.parametrize( - "kind", ["quicksort", "mergesort", "heapsort", "stable"] -) +@pytest.mark.parametrize("nelem,dtype", list(product(sort_nelem_args, sort_dtype_args))) +@pytest.mark.parametrize("kind", ["quicksort", "mergesort", "heapsort", "stable"]) def test_dataframe_sort_values_kind(nelem, dtype, kind): np.random.seed(0) df = DataFrame() diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index f18cb32a091..ace62fc53d1 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -93,9 +93,7 @@ def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer: def spilled_and_unspilled(manager: SpillManager) -> Tuple[int, int]: """Get bytes spilled and unspilled known by the manager""" spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled) - unspilled = sum( - buf.size for buf in manager.buffers() if not buf.is_spilled - ) + unspilled = sum(buf.size for buf in manager.buffers() if not buf.is_spilled) return spilled, unspilled @@ -322,9 +320,7 @@ def test_spill_to_device_limit(manager: SpillManager): assert single_column_df_data(df3).is_spilled -@pytest.mark.parametrize( - "manager", [{"device_memory_limit": 0}], indirect=True -) +@pytest.mark.parametrize("manager", [{"device_memory_limit": 0}], indirect=True) def test_zero_device_limit(manager: SpillManager): assert manager._device_memory_limit == 0 df1 = single_column_df() @@ -429,12 +425,8 @@ def f(sleep=False, nest=0): futures_with_spill_lock = [] futures_without_spill_lock = [] for _ in range(100): - futures_with_spill_lock.append( - executor.submit(f, sleep=True, nest=1) - ) - futures_without_spill_lock.append( - executor.submit(f, sleep=True, nest=1) - ) + futures_with_spill_lock.append(executor.submit(f, sleep=True, nest=1)) + futures_without_spill_lock.append(executor.submit(f, sleep=True, nest=1)) all(isinstance(f.result(), SpillLock) for f in futures_with_spill_lock) all(f is None for f in 
futures_without_spill_lock) @@ -494,9 +486,7 @@ def test_serialize_dask_dataframe(manager: SpillManager): protocol = pytest.importorskip("distributed.protocol") df1 = single_column_df(target="gpu") - header, frames = protocol.serialize( - df1, serializers=("dask",), on_error="raise" - ) + header, frames = protocol.serialize(df1, serializers=("dask",), on_error="raise") buf = single_column_df_data(df1) assert len(frames) == 1 assert isinstance(frames[0], memoryview) @@ -515,9 +505,7 @@ def test_serialize_cuda_dataframe(manager: SpillManager): protocol = pytest.importorskip("distributed.protocol") df1 = single_column_df(target="gpu") - header, frames = protocol.serialize( - df1, serializers=("cuda",), on_error="raise" - ) + header, frames = protocol.serialize(df1, serializers=("cuda",), on_error="raise") buf: SpillableBuffer = single_column_df_data(df1) assert len(buf.owner._spill_locks) == 1 assert len(frames) == 1 @@ -620,9 +608,7 @@ def test_memoryview_slice(manager: SpillManager, dtype): def test_statistics(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 - buf: SpillableBuffer = as_buffer( - data=rmm.DeviceBuffer(size=10), exposed=False - ) + buf: SpillableBuffer = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) buf.spill(target="cpu") if manager.statistics.level == 0: @@ -646,8 +632,7 @@ def test_statistics_expose(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 buffers: List[SpillableBuffer] = [ - as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) - for _ in range(10) + as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] # Expose the first buffer @@ -672,8 +657,7 @@ def test_statistics_expose(manager: SpillManager): # Create and spill 10 new buffers buffers: List[SpillableBuffer] = [ - as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) - for _ in range(10) + as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] manager.spill_to_device_limit(0) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index b9eb42906e8..6e604345897 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -109,9 +109,7 @@ def test_series_nunique(nan_as_null, dropna): got = cudf_series.nunique(dropna=dropna) assert expect == got - cudf_series = cudf.Series( - [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null - ) + cudf_series = cudf.Series([1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null) if nan_as_null is True: pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None]) else: @@ -151,9 +149,7 @@ def test_exact_quantiles(int_method): df = pd.DataFrame(arr) gdf_series = cudf.Series(arr) - q1 = gdf_series.quantile( - quant_values, interpolation=int_method, exact=True - ) + q1 = gdf_series.quantile(quant_values, interpolation=int_method, exact=True) q2 = df.quantile(quant_values, interpolation=int_method) @@ -170,9 +166,7 @@ def test_exact_quantiles_int(int_method): df = pd.DataFrame(arr) gdf_series = cudf.Series(arr) - q1 = gdf_series.quantile( - quant_values, interpolation=int_method, exact=True - ) + q1 = gdf_series.quantile(quant_values, interpolation=int_method, exact=True) q2 = df.quantile(quant_values, interpolation=int_method) @@ -288,9 +282,7 @@ def test_kurt_skew_error(op): cudf.Series(np.zeros(100)), cudf.Series(np.repeat(np.nan, 100)), cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), - cudf.Series( - [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False - ), + cudf.Series([5, 10, 53, None, 
np.nan, None, 12, 43, -423], nan_as_null=False), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([], dtype="float64"), cudf.Series([-3]), @@ -374,9 +366,7 @@ def test_series_pct_change(data, periods, fill_method): ) ): expected = ps.pct_change(periods=periods, fill_method=fill_method) - np.testing.assert_array_almost_equal( - got.to_numpy(na_value=np.nan), expected - ) + np.testing.assert_array_almost_equal(got.to_numpy(na_value=np.nan), expected) @pytest.mark.parametrize( @@ -465,14 +455,12 @@ def test_corr1d(data1, data2, method): ps1_align, ps2_align = ps1.align(ps2, join="inner") - is_singular = ( - len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0 - ) or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0) + is_singular = (len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0) or ( + len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0 + ) is_identical = ( len(ps1_align.dropna().unique()) == 1 and len(ps2_align.dropna()) > 0 - ) or ( - len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0 - ) + ) or (len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0) # Pearson correlation leads to division by 0 when either sample size is 1. # Spearman allows for size 1 samples, but will error if all data in a @@ -539,9 +527,7 @@ def test_nans_stats(data, ops, skipna): psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) gsr = cudf.from_pandas(psr) - assert_eq( - getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) - ) + assert_eq(getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna)) gsr = cudf.Series( data, dtype="float64" if len(data) == 0 else None, nan_as_null=False @@ -601,9 +587,7 @@ def test_cov_corr_datetime_timedelta(data1, data2, dtype): @pytest.mark.parametrize( "data", [ - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), + randomdata(nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}), ], ) @pytest.mark.parametrize("null_flag", [False, True]) @@ -633,9 +617,7 @@ def test_kurtosis_df(data, null_flag, numeric_only): @pytest.mark.parametrize( "data", [ - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), + randomdata(nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}), ], ) @pytest.mark.parametrize("null_flag", [False, True]) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index de771a56e77..bd755070334 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -175,11 +175,7 @@ def test_string_repr(ps_gs, item): "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] ) def test_string_astype(dtype): - if ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): + if dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("long"): data = ["1", "2", "3", "4", "5"] elif dtype.startswith("float"): data = [ @@ -299,9 +295,7 @@ def test_string_numeric_astype(dtype): if dtype.startswith("bool"): data = [1, 0, 1, 0, 1] elif ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") + dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("long") ): data = [1, 2, 3, 4, 5] elif dtype.startswith("float"): @@ -393,9 +387,7 @@ def _cat_convert_seq_to_cudf(others): gd_others = pd_others if isinstance(gd_others, (list, tuple)): temp_tuple = [ - cudf.from_pandas(elem) - if 
isinstance(elem, (pd.Series, pd.Index)) - else elem + cudf.from_pandas(elem) if isinstance(elem, (pd.Series, pd.Index)) else elem for elem in gd_others ] @@ -769,9 +761,7 @@ def test_string_join(ps_gs, sep): @pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) @pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize( - "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] -) +@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)]) def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ps, gs = ps_gs expectation = raise_builder([flags_raise], NotImplementedError) @@ -892,9 +882,7 @@ def test_string_repeat(data, repeats): @pytest.mark.parametrize("repl", ["qwerty", "", " "]) @pytest.mark.parametrize("case,case_raise", [(None, 0), (True, 1), (False, 1)]) @pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.U, 1)]) -def test_string_replace( - ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex -): +def test_string_replace(ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex): ps, gs = ps_gs expectation = raise_builder([case_raise, flags_raise], NotImplementedError) @@ -1197,9 +1185,7 @@ def test_string_no_children_properties(): ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], ], ) -@pytest.mark.parametrize( - "index", [-100, -5, -2, -6, -1, 0, 1, 2, 3, 9, 10, 100] -) +@pytest.mark.parametrize("index", [-100, -5, -2, -6, -1, 0, 1, 2, 3, 9, 10, 100]) def test_string_get(string, index): pds = pd.Series(string) gds = cudf.Series(string) @@ -1561,10 +1547,7 @@ def test_string_rsplit_re(n, expand): # Pandas does not yet support the regex parameter for rsplit import inspect - assert ( - "regex" - not in inspect.signature(pd.Series.str.rsplit).parameters.keys() - ) + assert "regex" not in inspect.signature(pd.Series.str.rsplit).parameters.keys() expect = ps.str.rsplit(pat=" ", n=n, expand=expand) got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) @@ -1632,23 +1615,15 @@ def test_strings_strip_tests(data, to_strip): ps = pd.Series(data) assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) - assert_eq( - ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip) - ) - assert_eq( - ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) - ) + assert_eq(ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip)) + assert_eq(ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip)) gi = as_index(data) pi = pd.Index(data) assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) - assert_eq( - pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip) - ) - assert_eq( - pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip) - ) + assert_eq(pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip)) + assert_eq(pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip)) def test_string_strip_fail(): @@ -1992,12 +1967,8 @@ def test_string_starts_ends(data, pat): rfunc_args_and_kwargs=([pat],), ) else: - assert_eq( - ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False - ) - assert_eq( - ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False - ) + assert_eq(ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False) + assert_eq(ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False) @pytest.mark.parametrize( @@ -2335,9 +2306,7 @@ def test_string_str_match(data, pat): gs = cudf.Series(data) assert_eq(ps.str.match(pat), 
gs.str.match(pat)) - assert_eq( - pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat) - ) + assert_eq(pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat)) @pytest.mark.parametrize( @@ -2370,20 +2339,12 @@ def test_string_str_translate(data): gs.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), ) assert_eq( - pd.Index(ps).str.translate( - str.maketrans({"a": "z", "i": "$", "z": "1"}) - ), - as_index(gs).str.translate( - str.maketrans({"a": "z", "i": "$", "z": "1"}) - ), + pd.Index(ps).str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), + as_index(gs).str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), ) assert_eq( - ps.str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - gs.str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), + ps.str.translate(str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."})), + gs.str.translate(str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."})), ) assert_eq( pd.Index(ps).str.translate( @@ -2410,9 +2371,7 @@ def test_string_str_filter_characters(): "", ] gs = cudf.Series(data) - expected = cudf.Series( - ["helloworld", "ABCD", "", "accnt", None, "150", ""] - ) + expected = cudf.Series(["helloworld", "ABCD", "", "accnt", None, "150", ""]) filter = {"a": "z", "A": "Z", "0": "9"} assert_eq(expected, gs.str.filter_characters(filter)) @@ -2659,9 +2618,7 @@ def test_istimestamp_empty(): def test_string_ip4_to_int(): - gsr = cudf.Series( - ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] - ) + gsr = cudf.Series(["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"]) expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) got = gsr.str.ip2int() @@ -3171,28 +3128,20 @@ def test_string_get_json_object_allow_single_quotes(): ] ) assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", allow_single_quotes=True - ), + gs.str.get_json_object("$.store.book[0].author", allow_single_quotes=True), cudf.Series(["Nigel Rees"]), ) assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", allow_single_quotes=True - ), + gs.str.get_json_object("$.store.book[*].title", allow_single_quotes=True), cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]), ) assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", allow_single_quotes=False - ), + gs.str.get_json_object("$.store.book[0].author", allow_single_quotes=False), cudf.Series([None]), ) assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", allow_single_quotes=False - ), + gs.str.get_json_object("$.store.book[*].title", allow_single_quotes=False), cudf.Series([None]), ) @@ -3384,9 +3333,7 @@ def test_str_join_lists_error(): ["-", "_", "**", None], "rep_str", "sep_str", - cudf.Series( - ["a-rep_str-b", None, "rep_str**hello**rep_str**world", None] - ), + cudf.Series(["a-rep_str-b", None, "rep_str**hello**rep_str**world", None]), ), ( cudf.Series([[None, "a"], [None], None]), @@ -3405,9 +3352,7 @@ def test_str_join_lists_error(): ], ) def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected): - actual = sr.str.join( - sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep - ) + actual = sr.str.join(sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep) assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 5dbb86fe27d..9566ee19ddd 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -76,9 +76,7 
@@ def run_udf_test(data, func, dtype): comparing it with the equivalent pandas result """ if dtype == "str": - output = rmm.DeviceBuffer( - size=len(data) * _get_extensionty_size(udf_string) - ) + output = rmm.DeviceBuffer(size=len(data) * _get_extensionty_size(udf_string)) else: dtype = np.dtype(dtype) output = cudf.core.column.column_empty(len(data), dtype=dtype) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 60d9516f385..100fc40fb97 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -226,9 +226,7 @@ def test_dataframe_to_struct(): assert_eq(expect, got) df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - expect = cudf.Series( - [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}] - ) + expect = cudf.Series([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}]) got = df.to_struct() assert_eq(expect, got) @@ -345,9 +343,7 @@ def test_struct_with_datetime_and_timedelta(dtype): def test_struct_int_values(): - series = cudf.Series( - [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] - ) + series = cudf.Series([{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}]) actual_series = series.to_pandas() assert isinstance(actual_series[0]["b"], int) @@ -443,8 +439,6 @@ def test_struct_empty_children_slice(indices, values): def test_struct_iterate_error(): - s = cudf.Series( - [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}] - ) + s = cudf.Series([{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}]) with pytest.raises(TypeError): iter(s.struct) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 1994536f395..e594ba6ba45 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -39,9 +39,7 @@ def arrow_arrays(request): @pytest.mark.parametrize("check_names", [True, False]) @pytest.mark.parametrize("rname", ["a", "b"]) @pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"]) def test_basic_assert_index_equal( rdata, exact, @@ -72,10 +70,7 @@ def test_basic_assert_index_equal( if kind is not None: if (kind == TypeError) and ( msg - == ( - "Categoricals can only be compared " - "if 'categories' are the same." 
- ) + == ("Categoricals can only be compared " "if 'categories' are the same.") ): kind = AssertionError with pytest.raises(kind): @@ -101,9 +96,7 @@ def test_basic_assert_index_equal( @pytest.mark.parametrize("rname", ["a", "b"]) @pytest.mark.parametrize("check_category_order", [True, False]) @pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] -) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"]) def test_basic_assert_series_equal( rdata, rname, @@ -163,9 +156,7 @@ def test_assert_column_equal_dtype_edge_cases(other): base = as_column([1, 2, 3]) # for these dtypes, the diff should always be 100% regardless of the values - with pytest.raises( - AssertionError, match=r".*values are different \(100.0 %\).*" - ): + with pytest.raises(AssertionError, match=r".*values are different \(100.0 %\).*"): assert_column_equal(base, other, check_dtype=False) # the exceptions are the empty and all null cases @@ -332,9 +323,7 @@ def test_series_different_type_cases(dtype, check_exact, check_dtype): sr1, sr2, check_exact=check_exact, check_dtype=check_dtype ) else: - assert_series_equal( - sr1, sr2, check_exact=check_exact, check_dtype=check_dtype - ) + assert_series_equal(sr1, sr2, check_exact=check_exact, check_dtype=check_dtype) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 0c591965361..73e85cd1b7f 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -84,9 +84,7 @@ @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_series_create(data, dtype): if dtype not in ("timedelta64[ns]"): - pytest.skip( - "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" - ) + pytest.skip("Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465") psr = pd.Series( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype ) @@ -108,9 +106,7 @@ def test_timedelta_series_create(data, dtype): @pytest.mark.parametrize("cast_dtype", ["int64", "category"]) def test_timedelta_from_typecast(data, dtype, cast_dtype): if dtype not in ("timedelta64[ns]"): - pytest.skip( - "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" - ) + pytest.skip("Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465") psr = pd.Series( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype ) @@ -367,9 +363,7 @@ def test_timedelta_ops_datetime_inputs( ), pd.DataFrame( { - "A": pd.Series( - pd.date_range("1994-1-1", periods=50, freq="D") - ), + "A": pd.Series(pd.date_range("1994-1-1", periods=50, freq="D")), "B": pd.Series([pd.Timedelta(days=i) for i in range(50)]), } ), @@ -664,9 +658,7 @@ def test_timedelta_reduction_ops(data, dtype, reduction_op): actual = getattr(gsr, reduction_op)() if pd.isna(expected) and pd.isna(actual): pass - elif isinstance(expected, pd.Timedelta) and isinstance( - actual, pd.Timedelta - ): + elif isinstance(expected, pd.Timedelta) and isinstance(actual, pd.Timedelta): assert ( expected.round(gsr._column.time_unit).value == actual.round(gsr._column.time_unit).value @@ -744,9 +736,7 @@ def test_timedelta_index(data, dtype): @pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW) @pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) @pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) -def 
test_timedelta_index_datetime_index_ops( - data, datetime_dtype, timedelta_dtype -): +def test_timedelta_index_datetime_index_ops(data, datetime_dtype, timedelta_dtype): gdt = cudf.Index(data, dtype=datetime_dtype) gtd = cudf.Index(data, dtype=timedelta_dtype) @@ -846,9 +836,7 @@ def test_timedelta_datetime_index_ops_misc( ], ) @pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") -def test_timedelta_index_ops_with_scalars( - request, data, other_scalars, dtype, op -): +def test_timedelta_index_ops_with_scalars(request, data, other_scalars, dtype, op): gtdi = cudf.Index(data=data, dtype=dtype) ptdi = gtdi.to_pandas() @@ -919,9 +907,7 @@ def test_timedelta_index_ops_with_scalars( "floordiv", ], ) -def test_timedelta_index_ops_with_cudf_scalars( - request, data, cpu_scalar, dtype, op -): +def test_timedelta_index_ops_with_cudf_scalars(request, data, cpu_scalar, dtype, op): gtdi = cudf.Index(data=data, dtype=dtype) ptdi = gtdi.to_pandas() @@ -1066,20 +1052,14 @@ def test_timedelta_fillna(data, dtype, fill_value): ), ( cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - cudf.Series( - ["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"] - ), + cudf.Series(["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"]), ), ( cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[s]"), - cudf.Series( - ["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"] - ), + cudf.Series(["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"]), ), ( - cudf.Series( - [None, None, None, None, None], dtype="timedelta64[us]" - ), + cudf.Series([None, None, None, None, None], dtype="timedelta64[us]"), cudf.Series([None, None, None, None, None], dtype="str"), ), ( diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 4843decedba..bc66e00ffdd 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -210,9 +210,7 @@ def func(row): # we should get: # [?, ?, , , ] - gdf = cudf.DataFrame( - {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} - ) + gdf = cudf.DataFrame({"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]}) run_masked_udf_test(func, gdf, check_dtype=False) @@ -280,8 +278,7 @@ def func(row): request.applymarker( pytest.mark.xfail( condition=( - (gdf["data"] == 1).any() - and op in {operator.pow, operator.ipow} + (gdf["data"] == 1).any() and op in {operator.pow, operator.ipow} ), reason="https://github.com/rapidsai/cudf/issues/7478", ) @@ -540,9 +537,7 @@ def func(x): # in pandas, 1**NA == 1. In cudf, 1**NA == NA. request.applymarker( pytest.mark.xfail( - condition=( - constant is cudf.NA and op in {operator.pow, operator.ipow} - ), + condition=(constant is cudf.NA and op in {operator.pow, operator.ipow}), reason="https://github.com/rapidsai/cudf/issues/7478", ) ) @@ -562,9 +557,7 @@ def func(x): # in pandas, 1**NA == 1. In cudf, 1**NA == NA. 
request.applymarker( pytest.mark.xfail( - condition=( - constant in {1} and op in {operator.pow, operator.ipow} - ), + condition=(constant in {1} and op in {operator.pow, operator.ipow}), reason="https://github.com/rapidsai/cudf/issues/7478", ) ) @@ -615,9 +608,7 @@ def outer(row): y = row["b"] return inner(x, y) - gdf = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) + gdf = cudf.DataFrame({"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]}) with pytest.raises(ValueError): gdf.apply(outer, axis=1) @@ -654,9 +645,7 @@ def func(row): @pytest.mark.parametrize( "unsupported_col", [ - _decimal_series( - ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) - ), + _decimal_series(["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)), cudf.Series([1, 2, 3], dtype="category"), cudf.interval_range(start=0, end=3), [[1, 2], [3, 4], [5, 6]], @@ -819,9 +808,7 @@ def f(x, c): assert precompiled.currsize == 2 -@pytest.mark.parametrize( - "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] -) +@pytest.mark.parametrize("data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]]) @pytest.mark.parametrize("operator", [float, int, bool]) def test_masked_udf_casting(operator, data): data = cudf.Series(data) @@ -1006,9 +993,7 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) - @pytest.mark.parametrize( - "concat_char", ["1", "a", "12", " ", "", ".", "@"] - ) + @pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) def test_string_udf_concat(self, str_udf_data, concat_char): def func(row): return row["str_col"] + concat_char diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 15d9d03d4a7..51ae5aa7de5 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import itertools import operator @@ -118,8 +118,7 @@ def test_scalar_no_negative_bools(): with pytest.raises( TypeError, match=re.escape( - "Boolean scalars in cuDF do not " - "support negation, use logical not" + "Boolean scalars in cuDF do not " "support negation, use logical not" ), ): -x diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index b21edc0477f..a3ff76988d3 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -16,14 +16,9 @@ def datadir(datadir): def assert_equal_tokenization_outputs(hf_output, cudf_output): + assert np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 assert ( - np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 - ) - assert ( - np.sum( - hf_output["attention_mask"] != cudf_output["attention_mask"].get() - ) - == 0 + np.sum(hf_output["attention_mask"] != cudf_output["attention_mask"].get()) == 0 ) @@ -32,12 +27,8 @@ def assert_equal_tokenization_outputs(hf_output, cudf_output): @pytest.mark.parametrize("stride", [0, 15, 30]) @pytest.mark.parametrize("add_special_tokens", [True, False]) @pytest.mark.parametrize("do_lower_case", [True, False]) -def test_subword_tokenize( - seq_len, stride, add_special_tokens, do_lower_case, datadir -): - with open( - os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" - ) as file: +def test_subword_tokenize(seq_len, stride, add_special_tokens, do_lower_case, datadir): + with open(os.path.join(datadir, "test_sentences.txt"), encoding="utf-8") as file: input_sentence_ls = [line.strip() for line in file] vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") @@ -128,9 +119,7 @@ def test_text_subword_tokenize(tmpdir): cudf_tokenizer = SubwordTokenizer(hash_file) - token_d = cudf_tokenizer( - sr, 8, 8, add_special_tokens=False, truncation=True - ) + token_d = cudf_tokenizer(sr, 8, 8, add_special_tokens=False, truncation=True) tokens, masks, metadata = ( token_d["input_ids"], token_d["attention_mask"], diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 2dccd583b23..097272c8aab 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import random import string @@ -109,9 +109,7 @@ def test_detokenize(): assert type(expected) == type(actual) assert_eq(expected, actual) - indices = cudf.Series( - [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 - ) + indices = cudf.Series([4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8) actual = strings.str.detokenize(indices, "+") expected = cudf.Series( [ @@ -681,9 +679,7 @@ def test_text_replace_tokens(): "emptyme", ], ) - targets = cudf.Series( - ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"] - ) + targets = cudf.Series(["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"]) replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""]) expected = cudf.Series( @@ -697,9 +693,7 @@ def test_text_replace_tokens(): assert_eq(expected, actual) - sr = cudf.Series( - ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] - ) + sr = cudf.Series(["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]) targets = cudf.Series(["🌬", "🔥", "🌊"]) replacements = "🚰" @@ -755,9 +749,7 @@ def test_text_filter_tokens(): actual = sr.str.filter_tokens(5, "🔥") assert_eq(expected, actual) - sr = cudf.Series( - ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] - ) + sr = cudf.Series(["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]) expected = cudf.Series( ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"] ) @@ -970,9 +962,7 @@ def test_jaccard_index(): str1.str.jaccard_index(str3, 5) -def _make_list_of_strings_of_random_length( - num_strings, min_length, max_length -): +def _make_list_of_strings_of_random_length(num_strings, min_length, max_length): return [ "".join( random.choice(string.ascii_lowercase) diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 494b48b3cfd..8862ffae125 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -120,17 +120,13 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file( - _get_cc_60_ptx_file() - ) + ptx_toolkit_version = _get_cuda_version_from_ptx_file(_get_cc_60_ptx_file()) # MVC is required whenever any PTX is newer than the driver # This could be the shipped PTX file or the PTX emitted by # the version of NVVM on the user system, the latter aligning # with the runtime version - if (driver_version < ptx_toolkit_version) or ( - driver_version < runtime_version - ): + if (driver_version < ptx_toolkit_version) or (driver_version < runtime_version): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: @@ -186,25 +182,19 @@ def _get_cuda_version_from_ptx_file(path): cuda_ver = ver_map.get(version) if cuda_ver is None: - raise ValueError( - f"Could not map PTX version {version} to a CUDA version" - ) + raise ValueError(f"Could not map PTX version {version} to a CUDA version") return cuda_ver class _CUDFNumbaConfig: def __enter__(self): - self.CUDA_LOW_OCCUPANCY_WARNINGS = ( - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS - ) + self.CUDA_LOW_OCCUPANCY_WARNINGS = numba_config.CUDA_LOW_OCCUPANCY_WARNINGS numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 self.CAPTURED_ERRORS = numba_config.CAPTURED_ERRORS numba_config.CAPTURED_ERRORS = "new_style" def __exit__(self, exc_type, exc_value, traceback): - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = ( - self.CUDA_LOW_OCCUPANCY_WARNINGS - ) + numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = self.CUDA_LOW_OCCUPANCY_WARNINGS numba_config.CAPTURED_ERRORS = 
self.CAPTURED_ERRORS diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py index 54f5ea08ee1..616f816ccc7 100644 --- a/python/cudf/cudf/utils/_ptxcompiler.py +++ b/python/cudf/cudf/utils/_ptxcompiler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,12 +85,8 @@ def safe_get_versions(): try: # allow user to specify driver/runtime # versions manually, if necessary - driver_version = os.environ[ - "PTXCOMPILER_KNOWN_DRIVER_VERSION" - ].split(".") - runtime_version = os.environ[ - "PTXCOMPILER_KNOWN_RUNTIME_VERSION" - ].split(".") + driver_version = os.environ["PTXCOMPILER_KNOWN_DRIVER_VERSION"].split(".") + runtime_version = os.environ["PTXCOMPILER_KNOWN_RUNTIME_VERSION"].split(".") driver_version, runtime_version = ( tuple(map(int, driver_version)), tuple(map(int, runtime_version)), diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index d57303ca122..ce3839c3644 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -66,9 +66,7 @@ @doc_apply() -def apply_rows( - df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key -): +def apply_rows(df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key): """Row-wise transformation Parameters @@ -116,21 +114,15 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): nullmask = column.as_column(df[k]._column.nullmask) if out_mask is None: - out_mask = column.as_column( - nullmask.copy(), dtype=utils.mask_dtype - ) + out_mask = column.as_column(nullmask.copy(), dtype=utils.mask_dtype) else: - out_mask = libcudf.binaryop.binaryop( - nullmask, out_mask, op, out_mask.dtype - ) + out_mask = libcudf.binaryop.binaryop(nullmask, out_mask, op, out_mask.dtype) return out_mask class ApplyKernelCompilerBase: - def __init__( - self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key - ): + def __init__(self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key): # Get signature of user function sig = pysignature(func) self.sig = sig @@ -151,15 +143,14 @@ def run(self, df, **launch_params): } else: inputs = { - k: df[k]._column.data_array_view(mode="read") - for k in self.incols + k: df[k]._column.data_array_view(mode="read") for k in self.incols } # Allocate output columns outputs = {} for k, dt in self.outcols.items(): - outputs[k] = column.column_empty( - len(df), dt, False - ).data_array_view(mode="write") + outputs[k] = column.column_empty(len(df), dt, False).data_array_view( + mode="write" + ) # Bind argument args = {} for dct in [inputs, outputs, self.kwargs]: @@ -175,9 +166,7 @@ def run(self, df, **launch_params): # Prepare output frame outdf = df.copy() for k in sorted(self.outcols): - outdf[k] = cudf.Series( - outputs[k], index=outdf.index, nan_as_null=False - ) + outdf[k] = cudf.Series(outputs[k], index=outdf.index, nan_as_null=False) if out_mask is not None: outdf._data[k] = outdf[k]._column.set_mask( out_mask.data_array_view(mode="write") @@ -202,9 +191,7 @@ def launch_kernel(self, df, args): class ApplyChunksCompiler(ApplyKernelCompilerBase): def compile(self, func, argnames, extra_argnames): # Compile kernel - kernel = _load_cache_or_make_chunk_wise_kernel( - func, argnames, extra_argnames - ) + kernel = _load_cache_or_make_chunk_wise_kernel(func, argnames, extra_argnames) return kernel def 
launch_kernel(self, df, args, chunks, blkct=None, tpb=None): @@ -222,9 +209,9 @@ def launch_kernel(self, df, args, chunks, blkct=None, tpb=None): def normalize_chunks(self, size, chunks): if isinstance(chunks, int): # *chunks* is the chunksize - return cuda.as_cuda_array( - cp.arange(start=0, stop=size, step=chunks) - ).view("int64") + return cuda.as_cuda_array(cp.arange(start=0, stop=size, step=chunks)).view( + "int64" + ) else: # *chunks* is an array of chunk leading offset return cuda.as_cuda_array(cp.asarray(chunks)).view("int64") @@ -259,9 +246,7 @@ def row_wise_kernel({args}): stop = "" stride = "ntid" srcidx = "{a} = {a}[{start}:{stop}:{stride}]" - body.append( - srcidx.format(a=a, start=start, stop=stop, stride=stride) - ) + body.append(srcidx.format(a=a, start=start, stop=stop, stride=stride)) body.append(f"inner({args})") @@ -311,9 +296,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): body.append(indent + "start = chunks[curblk]") body.append( - indent - + "stop = chunks[curblk + 1]" - + " if curblk + 1 < chunks.size else nrows" + indent + "stop = chunks[curblk + 1]" + " if curblk + 1 < chunks.size else nrows" ) slicedargs = {} @@ -323,9 +306,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): else: slicedargs[a] = str(a) body.append( - "{}inner({})".format( - indent, ", ".join(slicedargs[k] for k in argnames) - ) + "{}inner({})".format(indent, ", ".join(slicedargs[k] for k in argnames)) ) indented = ["{}{}".format(" " * 4, ln) for ln in body] diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 020c32de9f3..dc4624a9a14 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from pickle import dumps @@ -29,16 +29,12 @@ def window_sizes_from_offset(arr, offset): window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32") if arr.size > 0: with _CUDFNumbaConfig(): - gpu_window_sizes_from_offset.forall(arr.size)( - arr, window_sizes, offset - ) + gpu_window_sizes_from_offset.forall(arr.size)(arr, window_sizes, offset) return window_sizes @cuda.jit -def gpu_grouped_window_sizes_from_offset( - arr, window_sizes, group_starts, offset -): +def gpu_grouped_window_sizes_from_offset(arr, window_sizes, group_starts, offset): i = cuda.grid(1) j = i if i < arr.size: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e9dbc23d767..d6438833846 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -50,8 +50,7 @@ np.dtype("float64"): pd.Float64Dtype(), } pandas_dtypes_to_np_dtypes = { - pd_dtype: np_dtype - for np_dtype, pd_dtype in np_dtypes_to_pandas_dtypes.items() + pd_dtype: np_dtype for np_dtype, pd_dtype in np_dtypes_to_pandas_dtypes.items() } pyarrow_dtypes_to_pandas_dtypes = { @@ -128,17 +127,11 @@ def _find_common_type_decimal(dtypes): p = s + lhs if p > cudf.Decimal64Dtype.MAX_PRECISION: - return cudf.Decimal128Dtype( - min(cudf.Decimal128Dtype.MAX_PRECISION, p), s - ) + return cudf.Decimal128Dtype(min(cudf.Decimal128Dtype.MAX_PRECISION, p), s) elif p > cudf.Decimal32Dtype.MAX_PRECISION: - return cudf.Decimal64Dtype( - min(cudf.Decimal64Dtype.MAX_PRECISION, p), s - ) + return cudf.Decimal64Dtype(min(cudf.Decimal64Dtype.MAX_PRECISION, p), s) else: - return cudf.Decimal32Dtype( - min(cudf.Decimal32Dtype.MAX_PRECISION, p), s - ) + return cudf.Decimal32Dtype(min(cudf.Decimal32Dtype.MAX_PRECISION, p), s) def cudf_dtype_from_pydata_dtype(dtype): @@ -165,9 +158,7 @@ def cudf_dtype_to_pa_type(dtype): Python dtype. """ if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "No conversion from Categorical to pyarrow type" - ) + raise NotImplementedError("No conversion from Categorical to pyarrow type") elif isinstance( dtype, (cudf.StructDtype, cudf.ListDtype, cudf.core.dtypes.DecimalDtype), @@ -201,15 +192,12 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( - val, cudf.Scalar - ): + if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(val, cudf.Scalar): return val if not cudf.api.types._is_scalar_or_zero_d_array(val): raise ValueError( - f"Cannot convert value of type {type(val).__name__} " - "to cudf scalar" + f"Cannot convert value of type {type(val).__name__} " "to cudf scalar" ) if isinstance(val, Decimal): @@ -218,9 +206,9 @@ def to_cudf_compatible_scalar(val, dtype=None): if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: val = val.item() - if ( - (dtype is None) and isinstance(val, str) - ) or cudf.api.types.is_string_dtype(dtype): + if ((dtype is None) and isinstance(val, str)) or cudf.api.types.is_string_dtype( + dtype + ): dtype = "str" if isinstance(val, str) and val.endswith("\x00"): @@ -232,9 +220,7 @@ def to_cudf_compatible_scalar(val, dtype=None): # the string value directly (cudf.DeviceScalar will DTRT) return val - tz_error_msg = ( - "Cannot covert a timezone-aware timestamp to timezone-naive scalar." - ) + tz_error_msg = "Cannot covert a timezone-aware timestamp to timezone-naive scalar." 
if isinstance(val, pd.Timestamp): if val.tz is not None: raise NotImplementedError(tz_error_msg) @@ -249,9 +235,9 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) + val = _maybe_convert_to_default_type(cudf.api.types.pandas_dtype(type(val))).type( + val + ) if dtype is not None: if isinstance(val, str) and np.dtype(dtype).kind == "M": @@ -418,9 +404,9 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype( + if pd.api.types.is_datetime64_dtype(dtype) or pd.api.types.is_timedelta64_dtype( dtype - ) or pd.api.types.is_timedelta64_dtype(dtype): + ): time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -430,9 +416,7 @@ def _get_nan_for_dtype(dtype): def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): - error = TypeError( - f"{op} not supported between {dtype_l} and {dtype_r} scalars" - ) + error = TypeError(f"{op} not supported between {dtype_l} and {dtype_r} scalars") to_numpy_ops = { "__add__": _ADD_TYPES, @@ -463,9 +447,7 @@ def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): for valid_combo in allowed: ltype, rtype, outtype = valid_combo - if np.can_cast(dtype_l.char, ltype) and np.can_cast( - dtype_r.char, rtype - ): + if np.can_cast(dtype_l.char, ltype) and np.can_cast(dtype_r.char, rtype): return outtype raise error @@ -523,20 +505,14 @@ def find_common_type(dtypes): # Aggregate same types dtypes = set(dtypes) - if any( - isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes - ): + if any(isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes): if all( cudf.api.types.is_decimal_dtype(dtype) or cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes ): return _find_common_type_decimal( - [ - dtype - for dtype in dtypes - if cudf.api.types.is_decimal_dtype(dtype) - ] + [dtype for dtype in dtypes if cudf.api.types.is_decimal_dtype(dtype)] ) else: return cudf.dtype("O") @@ -550,30 +526,24 @@ def find_common_type(dtypes): # ListDtype(int64) & ListDtype(int32) common # dtype could be ListDtype(int64). 
raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " - "not supported" + "Finding a common type for `ListDtype` is currently " "not supported" ) if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): if len(dtypes) == 1: return dtypes.get(0) else: raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " - "not supported" + "Finding a common type for `StructDtype` is currently " "not supported" ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set( - filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) - ) + dt_dtypes = set(filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes)) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set( - filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) - ) + td_dtypes = set(filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes)) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) @@ -656,16 +626,12 @@ def _maybe_convert_to_default_type(dtype): """ if cudf.get_option("default_integer_bitwidth"): if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) + return cudf.dtype(f'i{cudf.get_option("default_integer_bitwidth")//8}') elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): + return cudf.dtype(f'u{cudf.get_option("default_integer_bitwidth")//8}') + if cudf.get_option("default_float_bitwidth") and cudf.api.types.is_float_dtype( + dtype + ): return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') return dtype @@ -684,9 +650,7 @@ def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: return True return False - elif is_integer(element) or ( - is_float(element) and element.is_integer() - ): + elif is_integer(element) or (is_float(element) and element.is_integer()): info = np.iinfo(dtype) if info.min <= element <= info.max: return True diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index b5387ddeb5f..386343e9e63 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -7,10 +7,7 @@ def validate_setup(): # TODO: Remove the following check once we arrive at a solution for #4827 # This is a temporary workaround to unblock internal testing # related issue: https://github.com/rapidsai/cudf/issues/4827 - if ( - "RAPIDS_NO_INITIALIZE" in os.environ - or "CUDF_NO_INITIALIZE" in os.environ - ): + if "RAPIDS_NO_INITIALIZE" in os.environ or "CUDF_NO_INITIALIZE" in os.environ: return import warnings @@ -129,8 +126,7 @@ def validate_setup(): # Driver Runtime version is >= Runtime version pass elif ( - cuda_driver_supported_rt_version >= 11000 - and cuda_runtime_version >= 11000 + cuda_driver_supported_rt_version >= 11000 and cuda_runtime_version >= 11000 ): # With cuda enhanced compatibility any code compiled # with 11.x version of cuda can now run on any diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index ef078ed8c5d..9e04e6300de 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -24,9 +24,7 @@ # Shifts for bit packing A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - 
A_SECOND_LEVEL_POW) -B_SECOND_LEVEL_SHIFT_AMT = np.uint8( - 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW -) +B_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW) BITS_FOR_INNER_TABLE_SIZE = np.uint8(8) NOT_FOUND = -1 @@ -93,12 +91,8 @@ def _find_hash_for_internal(hash_bin): new_length = _new_bin_length(len(hash_bin)) while True: - a = np.random.randint( - A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH - ) - b = np.random.randint( - B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH - ) + a = np.random.randint(A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH) + b = np.random.randint(B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH) bins = _make_bins(hash_bin, new_length, a, b) max_length = len(max(bins, key=len)) @@ -115,9 +109,7 @@ def _perfect_hash(integers, max_constant): ) flattened_bins = [] - internal_table_coeffs = np.zeros( - shape=[num_top_level_bins], dtype=np.uint64 - ) + internal_table_coeffs = np.zeros(shape=[num_top_level_bins], dtype=np.uint64) offset_into_flattened_table = np.zeros( shape=[num_top_level_bins + 1], dtype=np.uint64 ) @@ -134,9 +126,7 @@ def _perfect_hash(integers, max_constant): | coeff_b << B_SECOND_LEVEL_SHIFT_AMT | bin_length ) - offset_into_flattened_table[i + 1] = ( - offset_into_flattened_table[i] + bin_length - ) + offset_into_flattened_table[i + 1] = offset_into_flattened_table[i] + bin_length flattened_bins.extend(internal_table) print( @@ -199,8 +189,7 @@ def _store_func( f.write(f"{len(hash_table)}\n") f.writelines(f"{kv}\n" for kv in hash_table) f.writelines( - f"{tok_id}\n" - for tok_id in [unk_tok_id, first_token_id, sep_token_id] + f"{tok_id}\n" for tok_id in [unk_tok_id, first_token_id, sep_token_id] ) @@ -289,8 +278,6 @@ def hash_vocab( inner_table_coeffs, offsets_into_ht, ) - assert ( - val == value - ), f"Incorrect value found. Got {val} expected {value}" + assert val == value, f"Incorrect value found. Got {val} expected {value}" print("All present tokens return correct value.") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 925fd24e6c8..bd058cf8465 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -85,9 +85,7 @@ 0 10 hello 1 20 rapids 2 30 ai -""".format( - remote_data_sources=_docstring_remote_sources -) +""".format(remote_data_sources=_docstring_remote_sources) doc_read_avro = docfmt_partial(docstring=_docstring_read_avro) _docstring_read_parquet_metadata = """ @@ -120,9 +118,7 @@ -------- cudf.read_parquet """ -doc_read_parquet_metadata = docfmt_partial( - docstring=_docstring_read_parquet_metadata -) +doc_read_parquet_metadata = docfmt_partial(docstring=_docstring_read_parquet_metadata) _docstring_read_parquet = """ Load a Parquet dataset into a DataFrame @@ -396,9 +392,7 @@ -------- cudf.read_orc """ -doc_read_orc_statistics = docfmt_partial( - docstring=_docstring_read_orc_statistics -) +doc_read_orc_statistics = docfmt_partial(docstring=_docstring_read_orc_statistics) _docstring_read_orc = """ Load an ORC dataset into a DataFrame @@ -1416,9 +1410,7 @@ list of Filepath strings or in-memory buffers of data. compression : str Type of compression algorithm for the content - """.format( - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT -) + """.format(bytes_per_thread=_BYTES_PER_THREAD_DEFAULT) doc_get_reader_filepath_or_buffer = docfmt_partial( @@ -1621,9 +1613,7 @@ def _open_remote_files( # Use fsspec.parquet module. # TODO: Use `cat_ranges` to collect "known" # parts for all files at once. 
- row_groups = precache_options.pop("row_groups", None) or ( - [None] * len(paths) - ) + row_groups = precache_options.pop("row_groups", None) or ([None] * len(paths)) return [ ArrowPythonFile( _set_context( @@ -1652,8 +1642,7 @@ def _open_remote_files( # Default open - Use pyarrow filesystem API pa_fs = PyFileSystem(FSSpecHandler(fs)) return [ - _set_context(pa_fs.open_input_file(fpath), context_stack) - for fpath in paths + _set_context(pa_fs.open_input_file(fpath), context_stack) for fpath in paths ] @@ -1680,9 +1669,7 @@ def get_reader_filepath_or_buffer( # Get a filesystem object if one isn't already available paths = [path_or_data] if fs is None: - fs, paths = _get_filesystem_and_paths( - path_or_data, storage_options - ) + fs, paths = _get_filesystem_and_paths(path_or_data, storage_options) if fs is None: if warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. @@ -1722,9 +1709,7 @@ def get_reader_filepath_or_buffer( ) elif warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " "deprecated and will be removed in a future version. " @@ -1979,10 +1964,7 @@ def _apply_predicate(op, val, col_stats): def _apply_filters(filters, stats): for conjunction in filters: - if all( - _apply_predicate(op, val, stats[col]) - for col, op, val in conjunction - ): + if all(_apply_predicate(op, val, stats[col]) for col, op, val in conjunction): return True return False @@ -2023,9 +2005,7 @@ def _fsspec_data_transfer( # Require `fs` if `path_or_fob` is not file-like file_like = is_file_like(path_or_fob) if fs is None and not file_like: - raise ValueError( - "fs must be defined if `path_or_fob` is not file-like" - ) + raise ValueError("fs must be defined if `path_or_fob` is not file-like") # Calculate total file size if file_like: diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py index a4404e51232..c8bf14b2dba 100644 --- a/python/cudf/cudf/utils/nvtx_annotation.py +++ b/python/cudf/cudf/utils/nvtx_annotation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import hashlib from functools import partial @@ -25,6 +25,4 @@ def _cudf_nvtx_annotate(func, domain="cudf_python"): )(func) -_dask_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="dask_cudf_python" -) +_dask_cudf_nvtx_annotate = partial(_cudf_nvtx_annotate, domain="dask_cudf_python") diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 239438afd24..38ab3e89336 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import ast import datetime @@ -22,8 +22,7 @@ ENVREF_PREFIX = "__CUDF_ENVREF__" SUPPORTED_QUERY_TYPES = { - np.dtype(dt) - for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES + np.dtype(dt) for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES } @@ -41,9 +40,7 @@ def visit_Name(self, node): raise QuerySyntaxError("assignment is not allowed") name = node.id - chosen = ( - self.refnames if name.startswith(ENVREF_PREFIX) else self.colnames - ) + chosen = self.refnames if name.startswith(ENVREF_PREFIX) else self.colnames chosen.add(name) @@ -97,9 +94,7 @@ def query_builder(info, funcid): func: a python function of the query """ args = info["args"] - def_line = "def {funcid}({args}):".format( - funcid=funcid, args=", ".join(args) - ) + def_line = "def {funcid}({args}):".format(funcid=funcid, args=", ".join(args)) lines = [def_line, " return {}".format(info["source"])] source = "\n".join(lines) glbs = {} @@ -220,8 +215,7 @@ def query_execute(df, expr, callenv): # wait to check the types until we know which cols are used if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): raise TypeError( - "query only supports numeric, datetime, timedelta, " - "or bool dtypes." + "query only supports numeric, datetime, timedelta, " "or bool dtypes." ) colarrays = [col.data_array_view(mode="read") for col in colarrays] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 95621cf9519..acbf02f0359 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -163,8 +163,7 @@ def wrapper(*args, **kwargs): fn = frame.f_code.co_filename if _cudf_root in fn and _tests_root not in fn: raise RuntimeError( - f"External-only API called in {fn} at line {lineno}. " - f"{alternative}" + f"External-only API called in {fn} at line {lineno}. 
" f"{alternative}" ) return func(*args, **kwargs) @@ -226,9 +225,7 @@ def __getattr__(self, key): try: return self[key] except KeyError: - raise AttributeError( - f"{type(self).__name__} object has no attribute {key}" - ) + raise AttributeError(f"{type(self).__name__} object has no attribute {key}") class NotIterable: @@ -371,9 +368,7 @@ def _is_same_name(left_name, right_name): right_name, decimal.Decimal ): return left_name.is_nan() and right_name.is_nan() - if isinstance(left_name, float) and isinstance( - right_name, float - ): + if isinstance(left_name, float) and isinstance(right_name, float): return np.isnan(left_name) and np.isnan(right_name) if isinstance(left_name, np.datetime64) and isinstance( right_name, np.datetime64 diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index f017b46866f..ac5a50473d1 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -174,12 +174,12 @@ def test_groupby_apply_fallback(dataframe, groupby_udf): def test_groupby_external_series_apply_fallback(dataframe, groupby_udf): pdf, df = dataframe tm.assert_equal( - pdf.groupby( - pd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True - ).apply(groupby_udf), - df.groupby( - xpd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True - ).apply(groupby_udf), + pdf.groupby(pd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True).apply( + groupby_udf + ), + df.groupby(xpd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True).apply( + groupby_udf + ), ) @@ -374,9 +374,7 @@ def test_excel_round_trip(dataframe): excel_pdf.seek(0) excel_cudf_pandas.seek(0) - tm.assert_frame_equal( - pd.read_excel(excel_pdf), xpd.read_excel(excel_cudf_pandas) - ) + tm.assert_frame_equal(pd.read_excel(excel_pdf), xpd.read_excel(excel_cudf_pandas)) def test_hash_array(series): @@ -396,21 +394,15 @@ def test_is_sparse(): def test_is_file_like(): assert pd.api.types.is_file_like("a") == xpd.api.types.is_file_like("a") - assert pd.api.types.is_file_like(BytesIO()) == xpd.api.types.is_file_like( - BytesIO() - ) - assert pd.api.types.is_file_like( + assert pd.api.types.is_file_like(BytesIO()) == xpd.api.types.is_file_like(BytesIO()) + assert pd.api.types.is_file_like(StringIO("abc")) == xpd.api.types.is_file_like( StringIO("abc") - ) == xpd.api.types.is_file_like(StringIO("abc")) + ) def test_is_re_compilable(): - assert pd.api.types.is_re_compilable( - ".^" - ) == xpd.api.types.is_re_compilable(".^") - assert pd.api.types.is_re_compilable( - ".*" - ) == xpd.api.types.is_re_compilable(".*") + assert pd.api.types.is_re_compilable(".^") == xpd.api.types.is_re_compilable(".^") + assert pd.api.types.is_re_compilable(".*") == xpd.api.types.is_re_compilable(".*") def test_module_attribute_types(): @@ -432,12 +424,8 @@ def test_infer_freq(): def test_groupby_grouper_fallback(dataframe, groupby_udf): pdf, df = dataframe tm.assert_equal( - pdf.groupby(pd.Grouper("a"), sort=True, group_keys=True).apply( - groupby_udf - ), - df.groupby(xpd.Grouper("a"), sort=True, group_keys=True).apply( - groupby_udf - ), + pdf.groupby(pd.Grouper("a"), sort=True, group_keys=True).apply(groupby_udf), + df.groupby(xpd.Grouper("a"), sort=True, group_keys=True).apply(groupby_udf), ) @@ -526,9 +514,7 @@ def test_pyarrow_array_construction(data): assert actual_pa_array.equals(expected_pa_array) -@pytest.mark.parametrize( - "op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"] -) +@pytest.mark.parametrize("op", [">", "<", "==", "<=", ">=", 
"+", "%", "-", "*", "/"]) def test_cudf_pandas_eval_series(op): lhs = xpd.Series([10, 11, 12]) # noqa: F841 rhs = xpd.Series([100, 1, 12]) # noqa: F841 @@ -543,9 +529,7 @@ def test_cudf_pandas_eval_series(op): tm.assert_series_equal(expected, actual) -@pytest.mark.parametrize( - "op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"] -) +@pytest.mark.parametrize("op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"]) def test_cudf_pandas_eval_dataframe(op): lhs = xpd.DataFrame({"a": [10, 11, 12], "b": [1, 2, 3]}) # noqa: F841 rhs = xpd.DataFrame({"a": [100, 1, 12], "b": [15, -10, 3]}) # noqa: F841 @@ -560,9 +544,7 @@ def test_cudf_pandas_eval_dataframe(op): tm.assert_frame_equal(expected, actual) -@pytest.mark.parametrize( - "expr", ["((a + b) * c % d) > e", "((a + b) * c % d)"] -) +@pytest.mark.parametrize("expr", ["((a + b) * c % d) > e", "((a + b) * c % d)"]) def test_cudf_pandas_eval_complex(expr): data = { "a": [10, 11, 12], @@ -751,9 +733,7 @@ def test_chunked_csv_reader(tmpdir, data): tm.assert_equal(pd_chunk, xpd_chunk, check_index_type=False) -@pytest.mark.parametrize( - "data", [(), (1,), (1, 2, 3), ("a", "b", "c"), (1, 2, "test")] -) +@pytest.mark.parametrize("data", [(), (1,), (1, 2, 3), ("a", "b", "c"), (1, 2, "test")]) def test_construct_from_generator(data): expect = pd.Series((x for x in data)) got = xpd.Series((x for x in data)) @@ -792,9 +772,7 @@ def test_construct_timedelta_index(): ) def test_datetime_ops(op): pd_dt_idx1 = pd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") - cudf_pandas_dt_idx = xpd.DatetimeIndex( - [10, 20, 30], dtype="datetime64[ns]" - ) + cudf_pandas_dt_idx = xpd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") tm.assert_equal( op(pd_dt_idx1, pd_dt_idx1), op(cudf_pandas_dt_idx, cudf_pandas_dt_idx) @@ -815,9 +793,7 @@ def test_datetime_ops(op): ) def test_timedelta_ops(op): pd_td_idx1 = pd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") - cudf_pandas_td_idx = xpd.TimedeltaIndex( - [10, 20, 30], dtype="timedelta64[ns]" - ) + cudf_pandas_td_idx = xpd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") tm.assert_equal( op(pd_td_idx1, pd_td_idx1), op(cudf_pandas_td_idx, cudf_pandas_td_idx) @@ -827,14 +803,10 @@ def test_timedelta_ops(op): @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_timedelta_ops(op): pd_dt_idx1 = pd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") - cudf_pandas_dt_idx = xpd.DatetimeIndex( - [10, 20, 30], dtype="datetime64[ns]" - ) + cudf_pandas_dt_idx = xpd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") pd_td_idx1 = pd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") - cudf_pandas_td_idx = xpd.TimedeltaIndex( - [10, 20, 30], dtype="timedelta64[ns]" - ) + cudf_pandas_td_idx = xpd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") tm.assert_equal( op(pd_dt_idx1, pd_td_idx1), op(cudf_pandas_dt_idx, cudf_pandas_td_idx) @@ -906,12 +878,8 @@ def test_datetime_values_dtype_roundtrip(): def test_resample(): - ser = pd.Series( - range(3), index=pd.date_range("2020-01-01", freq="D", periods=3) - ) - xser = xpd.Series( - range(3), index=xpd.date_range("2020-01-01", freq="D", periods=3) - ) + ser = pd.Series(range(3), index=pd.date_range("2020-01-01", freq="D", periods=3)) + xser = xpd.Series(range(3), index=xpd.date_range("2020-01-01", freq="D", periods=3)) expected = ser.resample("D").max() result = xser.resample("D").max() # TODO: See if as_unit can be avoided diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py 
b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index 631ad2f37b2..315effe83ba 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -116,9 +116,7 @@ def func(): assert _fast_arg(func)() == (fast_x, fast_y.method()) -def test_fast_slow_arg_function_global( - monkeypatch, function_proxy, final_proxy -): +def test_fast_slow_arg_function_global(monkeypatch, function_proxy, final_proxy): fast_x, slow_x, x = function_proxy fast_y, slow_y, y = final_proxy @@ -338,9 +336,7 @@ def test_doc(fast_and_intermediate_with_doc, slow_and_intermediate_with_doc): assert inspect.getdoc(Pxy().prop) == inspect.getdoc(Slow().prop) assert inspect.getdoc(Pxy.method) == inspect.getdoc(Slow.method) assert inspect.getdoc(Pxy().method) == inspect.getdoc(Slow().method) - assert inspect.getdoc(Pxy().intermediate()) == inspect.getdoc( - Slow().intermediate() - ) + assert inspect.getdoc(Pxy().intermediate()) == inspect.getdoc(Slow().intermediate()) assert inspect.getdoc(Pxy().intermediate().method) == inspect.getdoc( Slow().intermediate().method ) @@ -538,9 +534,7 @@ def test_tuple_with_attrs_transform(): assert a != b assert b != c assert a != d - transform = partial( - _transform_arg, attribute_name="_fsproxy_fast", seen=set() - ) + transform = partial(_transform_arg, attribute_name="_fsproxy_fast", seen=set()) aprime = transform(a) bprime = transform(b) cprime = transform(c) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 4921446ab6b..f0a4961b9ad 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -37,11 +37,7 @@ def test_profiler(): "Series.__getitem__", } for name, func in per_function_stats.items(): - assert ( - len(func["cpu"]) == 0 - if "Time" not in name - else len(func["gpu"]) == 0 - ) + assert len(func["cpu"]) == 0 if "Time" not in name else len(func["gpu"]) == 0 per_line_stats = profiler.per_line_stats calls = [ diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py index 5adab566da0..8e5082234ec 100644 --- a/python/cudf_kafka/cudf_kafka/_version.py +++ b/python/cudf_kafka/cudf_kafka/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ import importlib.resources __version__ = ( - importlib.resources.files("cudf_kafka") - .joinpath("VERSION") - .read_text() - .strip() + importlib.resources.files("cudf_kafka").joinpath("VERSION").read_text().strip() ) __git_commit__ = "" diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py index 0f545f95f2b..800582c753b 100644 --- a/python/custreamz/custreamz/_version.py +++ b/python/custreamz/custreamz/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,9 +15,6 @@ import importlib.resources __version__ = ( - importlib.resources.files("custreamz") - .joinpath("VERSION") - .read_text() - .strip() + importlib.resources.files("custreamz").joinpath("VERSION").read_text().strip() ) __git_commit__ = "" diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index 0def0ba746e..5cf33e16c3b 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -129,8 +129,7 @@ def read_gdf( if topic is None: raise ValueError( - "ERROR: You must specify the topic " - "that you want to consume from" + "ERROR: You must specify the topic " "that you want to consume from" ) kafka_datasource = KafkaDatasource( diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..2abc416d2a4 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -14,9 +14,7 @@ def kafka_client(): s.shutdown(2) s.close() except Exception: - pytest.skip( - "A running Kafka instance must be available to run these tests" - ) + pytest.skip("A running Kafka instance must be available to run these tests") kafka_configs = { "metadata.broker.list": "localhost:9092", diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..b24cf691988 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -277,9 +277,7 @@ def test_getitem(stream): "indexer", [lambda g: g, lambda g: g[["y"]], lambda g: g[["x", "y"]]] ) def test_groupby_aggregate(agg, grouper, indexer, stream): - df = cudf.DataFrame( - {"x": (np.arange(10) // 2).astype(float), "y": [1.0, 2.0] * 5} - ) + df = cudf.DataFrame({"x": (np.arange(10) // 2).astype(float), "y": [1.0, 2.0] * 5}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -302,9 +300,7 @@ def f(x): def test_repr(stream): - df = cudf.DataFrame( - {"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10} - ) + df = cudf.DataFrame({"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10}) a = DataFrame(example=df, stream=stream) text = repr(a) @@ -321,9 +317,7 @@ def test_repr(stream): def test_repr_html(stream): - df = cudf.DataFrame( - {"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10} - ) + df = cudf.DataFrame({"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10}) a = DataFrame(example=df, stream=stream) for x in [a, a.y, a.y.mean()]: @@ -420,12 +414,8 @@ def test_setitem_overwrites(stream): (lambda df: df, lambda df: df.x), ], ) -def test_rolling_count_aggregations( - op, window, m, pre_get, post_get, kwargs, stream -): - index = pd.DatetimeIndex( - pd.date_range("2000-01-01", "2000-01-03", freq="1h") - ) +def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs, stream): + index = pd.DatetimeIndex(pd.date_range("2000-01-01", "2000-01-03", freq="1h")) df = cudf.DataFrame({"x": np.arange(len(index))}, index=index) expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs) @@ -620,16 +610,12 @@ def test_windowing_n(func, n, getter): @pytest.mark.parametrize("func", [lambda x: x.sum(), lambda x: x.mean()]) @pytest.mark.parametrize("value", ["10h", "1d"]) @pytest.mark.parametrize("getter", [lambda df: df, lambda df: 
df.x]) -@pytest.mark.parametrize( - "grouper", [lambda a: "y", lambda a: a.index, lambda a: ["y"]] -) +@pytest.mark.parametrize("grouper", [lambda a: "y", lambda a: a.index, lambda a: ["y"]]) @pytest.mark.parametrize( "indexer", [lambda g: g, lambda g: g[["x"]], lambda g: g[["x", "y"]]] ) def test_groupby_windowing_value(func, value, getter, grouper, indexer): - index = pd.DatetimeIndex( - pd.date_range("2000-01-01", "2000-01-03", freq="1h") - ) + index = pd.DatetimeIndex(pd.date_range("2000-01-01", "2000-01-03", freq="1h")) df = cudf.DataFrame( { "x": np.arange(len(index), dtype=float), @@ -753,9 +739,7 @@ def test_groupby_aggregate_with_start_state(stream): sdf = DataFrame(stream, example=example).groupby(["name"]) output0 = sdf.amount.sum(start=None).stream.gather().sink_to_list() output1 = ( - sdf.amount.mean(with_state=True, start=None) - .stream.gather() - .sink_to_list() + sdf.amount.mean(with_state=True, start=None).stream.gather().sink_to_list() ) output2 = sdf.amount.count(start=None).stream.gather().sink_to_list() @@ -763,9 +747,7 @@ def test_groupby_aggregate_with_start_state(stream): stream.emit(df) out_df0 = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [50, 100]}) - out_df1 = cudf.DataFrame( - {"name": ["Alice", "Tom"], "amount": [50.0, 100.0]} - ) + out_df1 = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [50.0, 100.0]}) out_df2 = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [1, 1]}) assert assert_eq(output0[0].reset_index(), out_df0) assert assert_eq(output1[0][1].reset_index(), out_df1) @@ -780,9 +762,7 @@ def test_groupby_aggregate_with_start_state(stream): .sink_to_list() ) output5 = sdf.amount.count(start=output2[0]).stream.gather().sink_to_list() - df = cudf.DataFrame( - {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} - ) + df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) stream.emit(df) out_df2 = cudf.DataFrame( @@ -791,9 +771,7 @@ def test_groupby_aggregate_with_start_state(stream): out_df3 = cudf.DataFrame( {"name": ["Alice", "Linda", "Tom"], "amount": [50.0, 200.0, 100.0]} ) - out_df4 = cudf.DataFrame( - {"name": ["Alice", "Linda", "Tom"], "amount": [2, 1, 2]} - ) + out_df4 = cudf.DataFrame({"name": ["Alice", "Linda", "Tom"], "amount": [2, 1, 2]}) assert assert_eq(output3[0].reset_index(), out_df2) assert assert_eq(output4[0][1].reset_index(), out_df3) assert assert_eq(output5[0].reset_index(), out_df4) @@ -806,9 +784,7 @@ def test_reductions_with_start_state(stream): output1 = sdf.amount.count(start=3).stream.gather().sink_to_list() output2 = sdf.amount.sum(start=10).stream.gather().sink_to_list() - df = cudf.DataFrame( - {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} - ) + df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) stream.emit(df) assert output0[0] == 72.0 @@ -826,9 +802,7 @@ def test_rolling_aggs_with_start_state(stream): .sink_to_list() ) - df = cudf.DataFrame( - {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} - ) + df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) stream.emit(df) df = cudf.DataFrame({"name": ["Bob"], "amount": [250]}) stream.emit(df) @@ -872,9 +846,7 @@ def test_window_aggs_with_start_state(stream): .sink_to_list() ) - df = cudf.DataFrame( - {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} - ) + df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) stream.emit(df) df = cudf.DataFrame({"name": ["Bob"], "amount": [250]}) stream.emit(df) 
@@ -905,13 +877,9 @@ def test_windowed_groupby_aggs_with_start_state(stream): .sink_to_list() ) - df = cudf.DataFrame( - {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} - ) + df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) stream.emit(df) - df = cudf.DataFrame( - {"name": ["Alice", "Linda", "Bob"], "amount": [250, 300, 350]} - ) + df = cudf.DataFrame({"name": ["Alice", "Linda", "Bob"], "amount": [250, 300, 350]}) stream.emit(df) stream = Stream() diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py index 0dd62854a4e..4a3f72077bc 100644 --- a/python/dask_cudf/dask_cudf/_version.py +++ b/python/dask_cudf/dask_cudf/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ import importlib.resources __version__ = ( - importlib.resources.files("dask_cudf") - .joinpath("VERSION") - .read_text() - .strip() + importlib.resources.files("dask_cudf").joinpath("VERSION").read_text().strip() ) __git_commit__ = "" diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index c7b4a1c4c6a..d9466313528 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -80,9 +80,7 @@ def _nonempty_index(idx): elif isinstance(idx._column, cudf.core.column.StringColumn): return cudf.Index(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.Index): - return cudf.core.index.Index( - np.arange(2, dtype=idx.dtype), name=idx.name - ) + return cudf.core.index.Index(np.arange(2, dtype=idx.dtype), name=idx.name) raise TypeError(f"Don't know how to handle index of type {type(idx)}") @@ -102,9 +100,7 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_nvtx_annotate def _get_non_empty_data(s): if isinstance(s, cudf.core.column.CategoricalColumn): - categories = ( - s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] - ) + categories = s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] codes = cudf.core.column.as_column( 0, dtype=cudf._lib.types.size_type_dtype, @@ -135,9 +131,7 @@ def _get_non_empty_data(s): data = data.tz_localize(str(s.dtype.tz))._column else: if pd.api.types.is_numeric_dtype(s.dtype): - data = cudf.core.column.as_column( - cp.arange(start=0, stop=2, dtype=s.dtype) - ) + data = cudf.core.column.as_column(cp.arange(start=0, stop=2, dtype=s.dtype)) else: data = cudf.core.column.as_column( cp.arange(start=0, stop=2, dtype="int64") @@ -247,9 +241,7 @@ def make_meta_object_cudf(x, index=None): return _empty_series(x[0], x[1], index=index) elif isinstance(x, (list, tuple)): if not all(isinstance(i, tuple) and len(i) == 2 for i in x): - raise ValueError( - f"Expected iterable of tuples of (name, dtype), got {x}" - ) + raise ValueError(f"Expected iterable of tuples of (name, dtype), got {x}") return cudf.DataFrame( {c: _empty_series(c, d, index=index) for (c, d) in x}, columns=[c for c, d in x], @@ -295,9 +287,7 @@ def concat_cudf( return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) -@categorical_dtype_dispatch.register( - (cudf.DataFrame, cudf.Series, cudf.BaseIndex) -) +@categorical_dtype_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) @_dask_cudf_nvtx_annotate def categorical_dtype_cudf(categories=None, ordered=False): return cudf.CategoricalDtype(categories=categories, 
ordered=ordered) @@ -339,15 +329,11 @@ def percentile_cudf(a, q, interpolation="linear"): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( - pd.Categorical.from_codes( - result, a.dtype.categories, a.dtype.ordered - ), + pd.Categorical.from_codes(result, a.dtype.categories, a.dtype.ordered), n, ) if np.issubdtype(a.dtype, np.datetime64): - result = a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ) + result = a.quantile([i / 100.0 for i in q], interpolation=interpolation) if q[0] == 0: # https://github.com/dask/dask/issues/6864 @@ -356,9 +342,7 @@ def percentile_cudf(a, q, interpolation="linear"): if not np.issubdtype(a.dtype, np.number): interpolation = "nearest" return ( - a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ).to_pandas(), + a.quantile([i / 100.0 for i in q], interpolation=interpolation).to_pandas(), n, ) @@ -371,9 +355,7 @@ def _get_pyarrow_schema_cudf(obj, preserve_index=None, **kwargs): f"`pyarrow_schema_dispatch`: {list(kwargs)}" ) - return _cudf_to_table( - meta_nonempty(obj), preserve_index=preserve_index - ).schema + return _cudf_to_table(meta_nonempty(obj), preserve_index=preserve_index).schema @to_pyarrow_table_dispatch.register(cudf.DataFrame) @@ -389,9 +371,7 @@ def _cudf_to_table(obj, preserve_index=None, **kwargs): if preserve_index and isinstance(obj.index, cudf.RangeIndex): obj = obj.copy() obj.index.name = ( - obj.index.name - if obj.index.name is not None - else "__index_level_0__" + obj.index.name if obj.index.name is not None else "__index_level_0__" ) obj.index = obj.index._as_int_index() @@ -420,9 +400,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) @_dask_cudf_nvtx_annotate -def union_categoricals_cudf( - to_union, sort_categories=False, ignore_order=False -): +def union_categoricals_cudf(to_union, sort_categories=False, ignore_order=False): return cudf.api.types._union_categoricals( to_union, sort_categories=False, ignore_order=False ) @@ -465,8 +443,7 @@ def group_split_cudf(df, c, k, ignore_index=False): @_dask_cudf_nvtx_annotate def sizeof_cudf_dataframe(df): return int( - sum(col.memory_usage for col in df._data.columns) - + df._index.memory_usage() + sum(col.memory_usage for col in df._data.columns) + df._index.memory_usage() ) @@ -632,9 +609,7 @@ def read_hdf(*args, **kwargs): "read_hdf is not yet implemented in cudf/dask_cudf. " "Moving to cudf from pandas. Expect poor performance!" 
) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) + return _default_backend(dd.read_hdf, *args, **kwargs).to_backend("cudf") # Define "cudf" backend entrypoint for dask-expr diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index bfe58531a73..c22c889be92 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -102,9 +102,7 @@ def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): cache_key = uuid.uuid4() def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) + return df.apply_rows(func, incols, outcols, kwargs, cache_key=cache_key) meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) return self.map_partitions( @@ -426,9 +424,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) n = num.count(split_every=split_every) name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) + result = map_partitions(var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof) if isinstance(ddf, DataFrame): result.divisions = (min(ddf.columns), max(ddf.columns)) return handle_out(out, result) @@ -471,8 +467,7 @@ def _finalize_var(vals): local_name = "local-" + name num = ddf._get_numeric_data() dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) + (local_name, n, 0): (_local_var, (num._name, n), skipna) for n in range(nparts) } # Use reduction tree @@ -486,9 +481,7 @@ def _finalize_var(vals): p_max = widths[depth - 1] lstart = split_every * group lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] + node_list = [(local_name, p, depth - 1) for p in range(lstart, lstop)] dsk[(local_name, group, depth)] = (_aggregate_var, node_list) if height == 1: group = depth = 0 @@ -635,10 +628,7 @@ def reduction( # Chunk a = f"{token or funcname(chunk)}-chunk-{token_key}" if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } + dsk = {(a, 0, i): (chunk, key) for i, key in enumerate(args[0].__dask_keys__())} else: dsk = { (a, 0, i): ( @@ -688,16 +678,13 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from dask_cudf import QUERY_PLANNING_ON if isinstance(getattr(data, "index", None), cudf.MultiIndex): - raise NotImplementedError( - "dask_cudf does not support MultiIndex Dataframes." 
- ) + raise NotImplementedError("dask_cudf does not support MultiIndex Dataframes.") # Dask-expr doesn't support the `name` argument name = {} if not QUERY_PLANNING_ON: name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) + "name": name or ("from_cudf-" + tokenize(data, npartitions or chunksize)) } return dd.from_pandas( diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index b2f92aeddda..4c7145ddaf1 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -43,9 +43,7 @@ def var( index = self._meta.to_pandas().var(numeric_only=True).index frame = frame[list(index)] return new_collection( - frame.expr.var( - axis, skipna, ddof, numeric_only, split_every=split_every - ) + frame.expr.var(axis, skipna, ddof, numeric_only, split_every=split_every) ) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 43ad4f0fee3..0b36218ed5a 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -193,9 +193,7 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): + def aggregate(self, arg, split_every=None, split_out=1, shuffle_method=None): if arg == "size": return self.size() @@ -337,9 +335,7 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): + def aggregate(self, arg, split_every=None, split_out=1, shuffle_method=None): if arg == "size": return self.size() @@ -601,9 +597,7 @@ def groupby_agg( split_out, token="cudf-aggregate", sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, + shuffle_method=shuffle_method if isinstance(shuffle_method, str) else None, ) # Deal with sort/shuffle defaults @@ -616,9 +610,7 @@ def groupby_agg( ) # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) + required_columns = list(set(gb_cols).union(aggs.keys()).intersection(ddf.columns)) return aca( [ddf[required_columns]], @@ -639,9 +631,7 @@ def groupby_agg( @_dask_cudf_nvtx_annotate -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): +def _make_groupby_agg_call(gb, aggs, split_every, split_out, shuffle_method=None): """Helper method to consolidate the common `groupby_agg` call for all aggregations in one place """ @@ -676,9 +666,7 @@ def _redirect_aggs(arg): if isinstance(arg[col], list): new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } + new_arg[col] = {k: redirects.get(v, v) for k, v in arg[col].items()} else: new_arg[col] = redirects.get(arg[col], arg[col]) return new_arg @@ -756,9 +744,7 @@ def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): df[pow2_name] = df[col].astype("float64").pow(2) _agg_dict[pow2_name] = ["sum"] - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(_agg_dict) output_columns = [_make_name(name, sep=sep) for name in gb.columns] gb.columns = output_columns # Return with deterministic 
column ordering @@ -788,9 +774,7 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep): else: raise ValueError(f"Unexpected aggregation: {agg}") - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(agg_dict) # Don't include the last aggregation in the column names output_columns = [ diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 49fea0d7602..2ebaa875817 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from io import BufferedWriter, IOBase @@ -18,9 +18,7 @@ def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): if kwargs is None: kwargs = {} with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) + df_stripe = cudf.read_orc(f, stripes=[stripe], columns=columns, **kwargs) return df_stripe @@ -78,17 +76,13 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): if schema is None: schema = o.schema elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) + raise ValueError("Incompatible schemas while parsing ORC files") nstripes_per_file.append(o.nstripes) schema = _get_pyarrow_dtypes(schema, categories=None) if columns is not None: ex = set(columns) - set(schema) if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) + raise ValueError(f"Requested columns ({ex}) not in schema ({set(schema)})") else: columns = list(schema) @@ -105,9 +99,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): N = 0 for path, n in zip(paths, nstripes_per_file): for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) + range(n) if filters is None else cudf.io.orc._filter_stripes(filters, path) ): dsk[(name, N)] = ( _read_orc_stripe, @@ -169,9 +161,7 @@ def to_orc( if hasattr(path, "name"): path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) + fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options) # Trim any protocol information from the path before forwarding path = fs._strip_protocol(path) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index fc962670c47..bd716659ab8 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -126,9 +126,7 @@ def _read_paths( pof, engine="cudf", columns=columns, - row_groups=row_groups[i] - if row_groups - else None, + row_groups=row_groups[i] if row_groups else None, dataset_kwargs=dataset_kwargs, categorical_partitions=False, **kwargs, @@ -284,8 +282,7 @@ def read_partition( paths.append(path) rgs.append( [row_group] - if not isinstance(row_group, list) - and row_group is not None + if not isinstance(row_group, list) and row_group is not None else row_group ) last_partition_keys = partition_keys @@ -372,18 +369,14 @@ def write_partition( engine=kwargs.get("engine", "cudf"), index=kwargs.get("index", None), partition_cols=kwargs.get("partition_cols", None), - partition_file_name=kwargs.get( - "partition_file_name", None - ), + partition_file_name=kwargs.get("partition_file_name", None), partition_offsets=kwargs.get("partition_offsets", None), 
statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT ), - row_group_size_rows=kwargs.get( - "row_group_size_rows", None - ), + row_group_size_rows=kwargs.get("row_group_size_rows", None), storage_options=kwargs.get("storage_options", None), metadata_file_path=filename if return_metadata else None, ) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index a35a9f1be48..89cf7c82001 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -86,9 +86,9 @@ def test_csv_roundtrip_filepath(tmp_path): def test_read_csv(tmp_path): - df = dask.datasets.timeseries( - dtypes={"x": int, "y": int}, freq="120s" - ).reset_index(drop=True) + df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( + drop=True + ) csv_path = str(tmp_path / "data-*.csv") df.to_csv(csv_path, index=False) @@ -115,9 +115,9 @@ def test_raises_FileNotFoundError(): def test_read_csv_w_bytes(tmp_path): - df = dask.datasets.timeseries( - dtypes={"x": int, "y": int}, freq="120s" - ).reset_index(drop=True) + df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( + drop=True + ) df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20))) df.to_csv(tmp_path / "data-*.csv", index=False) @@ -169,11 +169,7 @@ def test_read_csv_compression_file_list(tmp_path): def test_read_csv_blocksize_none(tmp_path, compression, size): df = pd.DataFrame(dict(x=np.arange(size), y=np.arange(size))) - path = ( - tmp_path / "data.csv.gz" - if compression == "gzip" - else tmp_path / "data.csv" - ) + path = tmp_path / "data.csv.gz" if compression == "gzip" else tmp_path / "data.csv" # Types need to be specified for empty csv files if size == 0: @@ -261,6 +257,4 @@ def test_read_csv_nrows(csv_end_bad_lines): def test_read_csv_nrows_error(csv_end_bad_lines): with pytest.raises(ValueError): - dask_cudf.read_csv( - csv_end_bad_lines, nrows=2, blocksize="100 MiB" - ).compute() + dask_cudf.read_csv(csv_end_bad_lines, nrows=2, blocksize="100 MiB").compute() diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index de2a735b2ce..18a06a7dda6 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -50,9 +50,7 @@ def test_roundtrip_backend_dispatch(tmpdir): @pytest.mark.parametrize("divisions", [True, False]) def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): tmpdir = str(tmpdir) - ddf.to_parquet( - tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow" - ) + ddf.to_parquet(tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow") files = sorted( (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)), key=natural_sort_key, @@ -63,15 +61,11 @@ def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] - ddf2 = dask_cudf.read_parquet( - files, columns=["x"], calculate_divisions=divisions - ) + ddf2 = dask_cudf.read_parquet(files, columns=["x"], calculate_divisions=divisions) dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' - ddf2 = dask_cudf.read_parquet( - files, columns="y", calculate_divisions=divisions - ) + ddf2 = dask_cudf.read_parquet(files, columns="y", 
calculate_divisions=divisions) dd.assert_eq(ddf["y"], ddf2, check_divisions=divisions) # Now include metadata @@ -79,15 +73,11 @@ def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] (with metadata) - ddf2 = dask_cudf.read_parquet( - tmpdir, columns=["x"], calculate_divisions=divisions - ) + ddf2 = dask_cudf.read_parquet(tmpdir, columns=["x"], calculate_divisions=divisions) dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' (with metadata) - ddf2 = dask_cudf.read_parquet( - tmpdir, columns="y", calculate_divisions=divisions - ) + ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", calculate_divisions=divisions) dd.assert_eq(ddf["y"], ddf2, check_divisions=divisions) @@ -150,9 +140,7 @@ def test_roundtrip_from_pandas(tmpdir): def test_strings(tmpdir): fn = str(tmpdir) - dfp = pd.DataFrame( - {"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]} - ) + dfp = pd.DataFrame({"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]}) dfp.set_index("a", inplace=True, drop=True) ddf2 = dd.from_pandas(dfp, npartitions=2) ddf2.to_parquet(fn, engine="pyarrow") @@ -177,9 +165,7 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) - read_df = dask_cudf.read_parquet( - fn, index=index, calculate_divisions=divisions - ) + read_df = dask_cudf.read_parquet(fn, index=index, calculate_divisions=divisions) dd.assert_eq( ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -190,14 +176,10 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): fn = str(tmpdir) - ddf2 = dask_cudf.from_cudf( - cudf.datasets.timeseries(freq="D"), npartitions=4 - ) + ddf2 = dask_cudf.from_cudf(cudf.datasets.timeseries(freq="D"), npartitions=4) ddf2.name = ddf2.name.astype("object") ddf2.to_parquet(fn, write_index=index) - read_df = dask_cudf.read_parquet( - fn, index=index, calculate_divisions=divisions - ) + read_df = dask_cudf.read_parquet(fn, index=index, calculate_divisions=divisions) dd.assert_eq( ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -224,9 +206,7 @@ def test_filters(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow") - a = dask_cudf.read_parquet( - tmp_path, filters=[("x", ">", 4)], split_row_groups=True - ) + a = dask_cudf.read_parquet(tmp_path, filters=[("x", ">", 4)], split_row_groups=True) assert a.npartitions == 3 assert (a.x > 3).all().compute() @@ -265,17 +245,13 @@ def test_isna_filters(tmpdir, null, numeric): # Test "is" col = "i" if numeric else "j" filters = [(col, "is", null)] - out = dask_cudf.read_parquet( - tmp_path, filters=filters, split_row_groups=True - ) + out = dask_cudf.read_parquet(tmp_path, filters=filters, split_row_groups=True) assert len(out) == 2 assert list(out.x.compute().values) == [4, 5] # Test "is not" filters = [(col, "is not", null)] - out = dask_cudf.read_parquet( - tmp_path, filters=filters, split_row_groups=True - ) + out = dask_cudf.read_parquet(tmp_path, filters=filters, split_row_groups=True) assert len(out) == 8 assert list(out.x.compute().values) == [0, 1, 2, 3, 6, 7, 8, 9] @@ -296,9 +272,7 @@ def test_filters_at_row_group_level(tmpdir): # Overwrite=True can be removed for dask-expr>=0.4.1 # See: 
https://github.com/dask-contrib/dask-expr/issues/800 - ddf.to_parquet( - tmp_path, engine="pyarrow", row_group_size=1, overwrite=True - ) + ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1, overwrite=True) b = dask_cudf.read_parquet( tmp_path, filters=[("x", "==", 1)], split_row_groups=True @@ -323,9 +297,7 @@ def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata): df.index.name = "index" if daskcudf: ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2) - ddf2.to_parquet( - tmpdir, write_metadata_file=metadata, partition_on=parts - ) + ddf2.to_parquet(tmpdir, write_metadata_file=metadata, partition_on=parts) else: ddf2 = dd.from_pandas(df, npartitions=2) ddf2.to_parquet( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index f4a6fabdb60..9d7f1559680 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -129,9 +129,7 @@ def test_read_parquet(s3_base, s3so, open_file_options): buffer = BytesIO() pdf.to_parquet(path=buffer) buffer.seek(0) - with s3_context( - s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer} - ): + with s3_context(s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer}): if "open_file_func" in open_file_options: fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index f3774e20d32..9aa98b859c5 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -31,16 +31,12 @@ def wrapper(*args, **kwargs): if old_arg_value is not None: new_arg_value = old_arg_value msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." + "the 'shuffle' keyword is deprecated, " "use 'shuffle_method' instead." ) warnings.warn(msg, FutureWarning) if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." - ) + msg = "Can only specify 'shuffle' " "or 'shuffle_method', not both." 
raise TypeError(msg) kwargs["shuffle_method"] = new_arg_value return func(*args, **kwargs) @@ -60,9 +56,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) + partitions = len(divisions) - divisions.searchsorted(s, side="right") - 1 partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( 0 if ascending else (len(divisions) - 2) ) @@ -198,8 +192,7 @@ def finalize_tsk(tsk): name = "quantiles-1-" + token val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) + (name, i): (_quantile, key, qs) for i, key in enumerate(df.__dask_keys__()) } name2 = "quantiles-2-" + token diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index ebb8e4be187..58553d14f0b 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -127,9 +127,7 @@ def test_categorical_basic(data): assert_eq(pdsr.cat.categories, dsr.cat.categories) - np.testing.assert_array_equal( - pdsr.cat.codes.values, result.cat.codes.values_host - ) + np.testing.assert_array_equal(pdsr.cat.codes.values, result.cat.codes.values_host) string = str(result) expect_str = """ @@ -230,12 +228,8 @@ def test_categorical_compare_ordered(data): assert pdsr1.cat.ordered # Test ordered operators - np.testing.assert_array_equal( - pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host - ) - np.testing.assert_array_equal( - pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host - ) + np.testing.assert_array_equal(pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host) + np.testing.assert_array_equal(pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host) ############################################################################# @@ -258,9 +252,7 @@ def test_string_slicing(data): def test_categorical_categories(): - df = DataFrame( - {"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)} - ) + df = DataFrame({"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)}) df["a"] = df["a"].astype("category") pdf = df.to_pandas(nullable=False) @@ -320,10 +312,7 @@ def data_test_non_numeric(): def data_test_nested(): - return [ - list(list(y for y in range(x % 5)) for x in range(i)) - for i in range(40) - ] + return [list(list(y for y in range(x % 5)) for x in range(i)) for i in range(40)] def data_test_sort(): @@ -534,9 +523,7 @@ def test_struct_explode(data): def test_tz_localize(): data = Series(date_range("2000-04-01", "2000-04-03", freq="H")) - expect = data.dt.tz_localize( - "US/Eastern", ambiguous="NaT", nonexistent="NaT" - ) + expect = data.dt.tz_localize("US/Eastern", ambiguous="NaT", nonexistent="NaT") got = dask_cudf.from_cudf(data, 2).dt.tz_localize( "US/Eastern", ambiguous="NaT", nonexistent="NaT" ) @@ -551,9 +538,7 @@ def test_tz_localize(): "data", [ date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("UTC"), - date_range("2000-04-01", "2000-04-03", freq="H").tz_localize( - "US/Eastern" - ), + date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("US/Eastern"), ], ) def test_tz_convert(data): diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 8a2f3414fd1..13bd444495c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -195,9 +195,7 @@ def test_set_index(nelem): 
# Use unique index range as the sort may not be stable-ordering x = np.arange(nelem) np.random.shuffle(x) - df = pd.DataFrame( - {"x": x, "y": np.random.randint(0, nelem, size=nelem)} - ) + df = pd.DataFrame({"x": x, "y": np.random.randint(0, nelem, size=nelem)}) ddf = dd.from_pandas(df, npartitions=2) ddf2 = ddf.to_backend("cudf") @@ -309,9 +307,7 @@ def test_rearrange_by_divisions(nelem, index): df["z"] = df["z"].astype("category") ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) + gdf1 = dask_cudf.from_cudf(cudf.DataFrame.from_pandas(df), npartitions=4) ddf1.index.name = index gdf1.index.name = index divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) @@ -492,10 +488,7 @@ def test_repartition_hash_staged(npartitions): # and that the key values are preserved expect_unique = gdf[by].drop_duplicates().sort_values(by) got_unique = cudf.concat( - [ - part[by].compute().drop_duplicates() - for part in ddf_new[by].partitions - ], + [part[by].compute().drop_duplicates() for part in ddf_new[by].partitions], ignore_index=True, ).sort_values(by) dd.assert_eq(got_unique, expect_unique, check_index=False) @@ -532,10 +525,7 @@ def test_repartition_hash(by, npartitions, max_branch): # and that the key values are preserved expect_unique = gdf[by].drop_duplicates().sort_values(by) got_unique = cudf.concat( - [ - part[by].compute().drop_duplicates() - for part in ddf_new[by].partitions - ], + [part[by].compute().drop_duplicates() for part in ddf_new[by].partitions], ignore_index=True, ).sort_values(by) dd.assert_eq(got_unique, expect_unique, check_index=False) @@ -594,11 +584,7 @@ def test_concat(gdf, gddf, series): if series: gdf = gdf.x gddf = gddf.x - a = ( - cudf.concat([gdf, gdf + 1, gdf + 2]) - .sort_values() - .reset_index(drop=True) - ) + a = cudf.concat([gdf, gdf + 1, gdf + 2]).sort_values().reset_index(drop=True) b = ( dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True) .compute() @@ -606,11 +592,7 @@ def test_concat(gdf, gddf, series): .reset_index(drop=True) ) else: - a = ( - cudf.concat([gdf, gdf + 1, gdf + 2]) - .sort_values("x") - .reset_index(drop=True) - ) + a = cudf.concat([gdf, gdf + 1, gdf + 2]).sort_values("x").reset_index(drop=True) b = ( dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True) .compute() @@ -689,9 +671,7 @@ def test_hash_object_dispatch(index): ) def test_make_meta_backends(index): dtypes = ["int8", "int32", "int64", "float64"] - df = cudf.DataFrame( - {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes} - ) + df = cudf.DataFrame({dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}) df["strings"] = ["cat", "dog", "fish"] df["cats"] = df["strings"].astype("category") df["time_s"] = np.array( @@ -801,9 +781,7 @@ def test_dataframe_describe(): ddf = dask_cudf.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq( - ddf.describe(), pddf.describe(), check_exact=False, atol=0.0001 - ) + dd.assert_eq(ddf.describe(), pddf.describe(), check_exact=False, atol=0.0001) @xfail_dask_expr("Insufficient describe support in dask-expr") @@ -850,17 +828,13 @@ def test_index_map_partitions(): def test_merging_categorical_columns(): - df_1 = cudf.DataFrame( - {"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]} - ) + df_1 = cudf.DataFrame({"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]}) ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) - df_2 = cudf.DataFrame( - {"id_2": 
[111, 112, 113], "cat_col": ["g", "h", "f"]} - ) + df_2 = cudf.DataFrame({"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]}) ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 3bb3e3b0bb8..9757f5966a4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -62,9 +62,7 @@ def pdf(request): def test_groupby_basic(series, aggregation, pdf): gdf = cudf.DataFrame.from_pandas(pdf) gdf_grouped = gdf.groupby("xx", dropna=True) - ddf_grouped = dask_cudf.from_cudf(gdf, npartitions=5).groupby( - "xx", dropna=True - ) + ddf_grouped = dask_cudf.from_cudf(gdf, npartitions=5).groupby("xx", dropna=True) if series: gdf_grouped = gdf_grouped.x @@ -223,9 +221,7 @@ def test_reset_index_multiindex(): @pytest.mark.parametrize("split_out", [1, 2, 3]) -@pytest.mark.parametrize( - "column", ["c", "d", "e", ["b", "c"], ["b", "d"], ["b", "e"]] -) +@pytest.mark.parametrize("column", ["c", "d", "e", ["b", "c"], ["b", "d"], ["b", "e"]]) def test_groupby_split_out(split_out, column): df = pd.DataFrame( { @@ -243,26 +239,17 @@ def test_groupby_split_out(split_out, column): gddf = dask_cudf.from_cudf(gdf, npartitions=3) ddf_result = ( - ddf.groupby(column) - .a.mean(split_out=split_out) - .compute() - .sort_values() - .dropna() + ddf.groupby(column).a.mean(split_out=split_out).compute().sort_values().dropna() ) gddf_result = ( - gddf.groupby(column) - .a.mean(split_out=split_out) - .compute() - .sort_values() + gddf.groupby(column).a.mean(split_out=split_out).compute().sort_values() ) dd.assert_eq(gddf_result, ddf_result, check_index=False) @pytest.mark.parametrize("dropna", [False, True, None]) -@pytest.mark.parametrize( - "by", ["a", "b", "c", "d", ["a", "b"], ["a", "c"], ["a", "d"]] -) +@pytest.mark.parametrize("by", ["a", "b", "c", "d", ["a", "b"], ["a", "c"], ["a", "d"]]) def test_groupby_dropna_cudf(dropna, by): # NOTE: This test is borrowed from upstream dask # (dask/dask/dataframe/tests/test_groupby.py) @@ -309,16 +296,12 @@ def test_groupby_dropna_cudf(dropna, by): pytest.param( False, ["a", "b"], - marks=pytest.mark.xfail( - reason="https://github.com/dask/dask/issues/8817" - ), + marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/8817"), ), pytest.param( False, ["a", "c"], - marks=pytest.mark.xfail( - reason="https://github.com/dask/dask/issues/8817" - ), + marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/8817"), ), pytest.param( False, @@ -519,9 +502,7 @@ def test_groupby_reset_index_dtype(): def test_groupby_reset_index_names(): - df = cudf.datasets.randomdata( - nrows=10, dtypes={"a": str, "b": int, "c": int} - ) + df = cudf.datasets.randomdata(nrows=10, dtypes={"a": str, "b": int, "c": int}) pdf = df.to_pandas() gddf = dask_cudf.from_cudf(df, 2) @@ -543,17 +524,11 @@ def test_groupby_reset_index_string_name(): gddf = dask_cudf.from_cudf(df, npartitions=1) pddf = dd.from_pandas(pdf, npartitions=1) - g_res = ( - gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) - ) - p_res = ( - pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) - ) + g_res = gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) + p_res = pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) got = g_res.compute().sort_values(["key", "value"]).reset_index(drop=True) - expect = ( - p_res.compute().sort_values(["key", 
"value"]).reset_index(drop=True) - ) + expect = p_res.compute().sort_values(["key", "value"]).reset_index(drop=True) dd.assert_eq(got, expect) assert len(g_res) == len(p_res) @@ -646,9 +621,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): assert ("name", "") in gr.columns and ("a", "") in gr.columns # Check `split_out` argument - assert gr.npartitions == ( - 1 if split_out == "use_dask_default" else split_out - ) + assert gr.npartitions == (1 if split_out == "use_dask_default" else split_out) # Compute for easier multiindex handling gf = gr.compute() @@ -659,19 +632,13 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): gf = gf.reset_index(drop=False) sort_cols = [("name", ""), ("a", ""), ("c", "mean")] gf = gf.sort_values(sort_cols).reset_index(drop=True) - pf = ( - pf.reset_index(drop=False) - .sort_values(sort_cols) - .reset_index(drop=True) - ) + pf = pf.reset_index(drop=False).sort_values(sort_cols).reset_index(drop=True) dd.assert_eq(gf, pf) @xfail_dask_expr("Newer dask-expr version needed") -@pytest.mark.parametrize( - "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] -) +@pytest.mark.parametrize("aggregations", [(sum, "sum"), (max, "max"), (min, "min")]) def test_groupby_agg_redirect(aggregations): pdf = pd.DataFrame( { @@ -764,9 +731,7 @@ def test_groupby_with_list_of_series(): ddf = dd.from_pandas(df.to_pandas(), npartitions=2) pgs = dd.from_pandas(gs.to_pandas(), npartitions=2) - dd.assert_eq( - gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"]) - ) + dd.assert_eq(gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"])) @xfail_dask_expr("Nested renamer not supported in dask-expr") @@ -805,9 +770,7 @@ def test_groupby_nested_dict(func): lambda df: df.groupby(["x", "y"]).min(), pytest.param( lambda df: df.groupby(["x", "y"]).agg("min"), - marks=pytest.mark.skip( - reason="https://github.com/dask/dask/issues/9093" - ), + marks=pytest.mark.skip(reason="https://github.com/dask/dask/issues/9093"), ), lambda df: df.groupby(["x", "y"]).y.min(), lambda df: df.groupby(["x", "y"]).y.agg("min"), @@ -831,32 +794,24 @@ def test_groupby_all_columns(func): def test_groupby_shuffle(): - df = cudf.datasets.randomdata( - nrows=640, dtypes={"a": str, "b": int, "c": int} - ) + df = cudf.datasets.randomdata(nrows=640, dtypes={"a": str, "b": int, "c": int}) gddf = dask_cudf.from_cudf(df, 8) spec = {"b": "mean", "c": "max"} expect = df.groupby("a", sort=True).agg(spec) # Sorted aggregation, single-partition output # (sort=True, split_out=1) - got = gddf.groupby("a", sort=True).agg( - spec, shuffle_method=True, split_out=1 - ) + got = gddf.groupby("a", sort=True).agg(spec, shuffle_method=True, split_out=1) dd.assert_eq(expect, got) # Sorted aggregation, multi-partition output # (sort=True, split_out=2) - got = gddf.groupby("a", sort=True).agg( - spec, shuffle_method=True, split_out=2 - ) + got = gddf.groupby("a", sort=True).agg(spec, shuffle_method=True, split_out=2) dd.assert_eq(expect, got) # Un-sorted aggregation, single-partition output # (sort=False, split_out=1) - got = gddf.groupby("a", sort=False).agg( - spec, shuffle_method=True, split_out=1 - ) + got = gddf.groupby("a", sort=False).agg(spec, shuffle_method=True, split_out=1) dd.assert_eq(expect.sort_index(), got.compute().sort_index()) # Un-sorted aggregation, multi-partition output @@ -869,9 +824,7 @@ def test_groupby_shuffle(): # Sorted aggregation fails with split_out>1 when shuffle is False # (sort=True, split_out=2, 
shuffle_method=False) with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) + gddf.groupby("a", sort=True).agg(spec, shuffle_method=False, split_out=2) # Check shuffle kwarg deprecation with pytest.warns(match="'shuffle' keyword is deprecated"): diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 42ecc130298..4d3639bfb84 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -140,9 +140,7 @@ def gather(df, grows): @pytest.mark.parametrize("right_nrows", param_nrows) @pytest.mark.parametrize("left_nkeys", [4, 5]) @pytest.mark.parametrize("right_nkeys", [4, 5]) -def test_merge_left( - left_nrows, right_nrows, left_nkeys, right_nkeys, how="left" -): +def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): chunksize = 3 np.random.seed(0) @@ -167,9 +165,7 @@ def test_merge_left( def normalize(df): return ( - df.to_pandas() - .sort_values(["x", "y", "a_x", "a_y"]) - .reset_index(drop=True) + df.to_pandas().sort_values(["x", "y", "a_x", "a_y"]).reset_index(drop=True) ) # dask_cudf @@ -187,9 +183,7 @@ def normalize(df): @pytest.mark.parametrize("right_nrows", [5, 10]) @pytest.mark.parametrize("left_nkeys", [4]) @pytest.mark.parametrize("right_nkeys", [4]) -def test_merge_1col_left( - left_nrows, right_nrows, left_nkeys, right_nkeys, how="left" -): +def test_merge_1col_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): chunksize = 3 np.random.seed(0) @@ -209,11 +203,7 @@ def test_merge_1col_left( ) expect = left.merge(right, on=["x"], how=how) - expect = ( - expect.to_pandas() - .sort_values(["x", "a_x", "a_y"]) - .reset_index(drop=True) - ) + expect = expect.to_pandas().sort_values(["x", "a_x", "a_y"]).reset_index(drop=True) # dask_cudf left = dask_cudf.from_cudf(left, chunksize=chunksize) @@ -272,18 +262,14 @@ def test_indexed_join(how): # occasionally order is not correct (possibly do to hashing in the merge) d = d.sort_values("x") # index is preserved - dg = dg.sort_values( - "x" - ) # index is reset -- sort_values will slow test down + dg = dg.sort_values("x") # index is reset -- sort_values will slow test down dd.assert_eq(d, dg, check_index=False) @pytest.mark.parametrize("how", ["left", "inner"]) def test_how(how): - left = cudf.DataFrame( - {"x": [1, 2, 3, 4, None], "y": [1.0, 2.0, 3.0, 4.0, 0.0]} - ) + left = cudf.DataFrame({"x": [1, 2, 3, 4, None], "y": [1.0, 2.0, 3.0, 4.0, 0.0]}) right = cudf.DataFrame({"x": [2, 3, None, 2], "y": [20, 30, 0, 20]}) dleft = dd.from_pandas(left, npartitions=2) @@ -325,12 +311,8 @@ def test_single_dataframe_merge(daskify): @pytest.mark.parametrize("how", ["inner", "left"]) @pytest.mark.parametrize("on", ["id_1", ["id_1"], ["id_1", "id_2"]]) def test_on(how, on): - left = cudf.DataFrame( - {"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]} - ) - right = cudf.DataFrame( - {"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]} - ) + left = cudf.DataFrame({"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]}) + right = cudf.DataFrame({"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]}) dleft = dd.from_pandas(left, npartitions=2) dright = dd.from_pandas(right, npartitions=3) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index c3056f2607c..333c64eb73b 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ 
b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -63,9 +63,7 @@ def test_series_reduce(reducer): ), ], ) -@pytest.mark.parametrize( - "op", ["max", "min", "sum", "prod", "mean", "var", "std"] -) +@pytest.mark.parametrize("op", ["max", "min", "sum", "prod", "mean", "var", "std"]) def test_rowwise_reductions(data, op): gddf = dask_cudf.from_cudf(data, npartitions=10) pddf = gddf.to_backend("pandas") diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 9184ad996ad..a4dea8901fe 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -22,9 +22,7 @@ "c", pytest.param( "d", - marks=xfail_dask_expr( - "Dask-expr fails to sort by categorical column." - ), + marks=xfail_dask_expr("Dask-expr fails to sort by categorical column."), ), ["a", "b"], ["c", "d"], @@ -93,12 +91,8 @@ def test_sort_values_with_nulls(data, by, ascending, na_position): ddf = dd.from_pandas(df, npartitions=5) with dask.config.set(scheduler="single-threaded"): - got = ddf.sort_values( - by=by, ascending=ascending, na_position=na_position - ) - expect = df.sort_values( - by=by, ascending=ascending, na_position=na_position - ) + got = ddf.sort_values(by=by, ascending=ascending, na_position=na_position) + expect = df.sort_values(by=by, ascending=ascending, na_position=na_position) # cudf ordering for nulls is non-deterministic dd.assert_eq(got[by], expect[by], check_index=False) From 45444f149797dd661d0ed6467c8801d7faacc7db Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:04:46 -0700 Subject: [PATCH 2/9] Replace black with ruff-format --- pyproject.toml | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4048eb9452c..9b67284dd33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,22 +1,4 @@ -[tool.black] -line-length = 79 -target-version = ["py39"] -include = '\.py?$' -force-exclude = ''' -/( - thirdparty | - \.eggs | - \.git | - \.hg | - \.mypy_cache | - \.tox | - \.venv | - _build | - buck-out | - build | - dist -)/ -''' +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
[tool.pydocstyle] # Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather From 3fb9cad4199279efaa61538eb7cede0d4b38cca3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:18:52 -0700 Subject: [PATCH 3/9] Go back to 79 line length --- pyproject.toml | 2 +- python/cudf/benchmarks/API/bench_dataframe.py | 21 +- python/cudf/benchmarks/API/bench_functions.py | 12 +- .../cudf/benchmarks/API/bench_multiindex.py | 6 +- python/cudf/benchmarks/API/cases_functions.py | 20 +- python/cudf/benchmarks/common/utils.py | 6 +- python/cudf/benchmarks/conftest.py | 17 +- .../cudf/benchmarks/internal/bench_column.py | 8 +- python/cudf/cudf/_fuzz_testing/avro.py | 9 +- python/cudf/cudf/_fuzz_testing/csv.py | 38 +- python/cudf/cudf/_fuzz_testing/fuzzer.py | 4 +- python/cudf/cudf/_fuzz_testing/io.py | 10 +- python/cudf/cudf/_fuzz_testing/json.py | 14 +- python/cudf/cudf/_fuzz_testing/main.py | 6 +- python/cudf/cudf/_fuzz_testing/orc.py | 20 +- python/cudf/cudf/_fuzz_testing/parquet.py | 16 +- .../_fuzz_testing/tests/fuzz_test_json.py | 14 +- .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 10 +- .../_fuzz_testing/tests/fuzz_test_parquet.py | 6 +- python/cudf/cudf/_fuzz_testing/utils.py | 16 +- python/cudf/cudf/_typing.py | 8 +- python/cudf/cudf/_version.py | 6 +- python/cudf/cudf/api/extensions/accessor.py | 4 +- python/cudf/cudf/api/types.py | 31 +- python/cudf/cudf/core/_base_index.py | 72 ++- .../cudf/cudf/core/_internals/expressions.py | 5 +- python/cudf/cudf/core/_internals/timezones.py | 27 +- python/cudf/cudf/core/_internals/where.py | 10 +- python/cudf/cudf/core/abc.py | 8 +- python/cudf/cudf/core/algorithms.py | 4 +- python/cudf/cudf/core/buffer/buffer.py | 12 +- python/cudf/cudf/core/buffer/spill_manager.py | 16 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 19 +- python/cudf/cudf/core/buffer/utils.py | 7 +- python/cudf/cudf/core/column/categorical.py | 176 ++++-- python/cudf/cudf/core/column/column.py | 222 +++++-- python/cudf/cudf/core/column/datetime.py | 79 ++- python/cudf/cudf/core/column/decimal.py | 36 +- python/cudf/cudf/core/column/interval.py | 12 +- python/cudf/cudf/core/column/lists.py | 53 +- python/cudf/cudf/core/column/methods.py | 8 +- python/cudf/cudf/core/column/numerical.py | 82 ++- .../cudf/cudf/core/column/numerical_base.py | 34 +- python/cudf/cudf/core/column/string.py | 373 ++++++++--- python/cudf/cudf/core/column/struct.py | 24 +- python/cudf/cudf/core/column/timedelta.py | 104 +-- python/cudf/cudf/core/column_accessor.py | 43 +- python/cudf/cudf/core/common.py | 6 +- python/cudf/cudf/core/copy_types.py | 10 +- python/cudf/cudf/core/cut.py | 18 +- python/cudf/cudf/core/dataframe.py | 594 +++++++++++++----- python/cudf/cudf/core/df_protocol.py | 91 ++- python/cudf/cudf/core/dtypes.py | 51 +- python/cudf/cudf/core/frame.py | 106 +++- python/cudf/cudf/core/groupby/groupby.py | 184 ++++-- python/cudf/cudf/core/index.py | 156 +++-- python/cudf/cudf/core/indexed_frame.py | 276 +++++--- python/cudf/cudf/core/indexing_utils.py | 20 +- python/cudf/cudf/core/join/_join_helpers.py | 22 +- python/cudf/cudf/core/join/join.py | 47 +- python/cudf/cudf/core/mixins/mixin_factory.py | 16 +- python/cudf/cudf/core/multiindex.py | 152 +++-- python/cudf/cudf/core/resample.py | 18 +- python/cudf/cudf/core/reshape.py | 45 +- python/cudf/cudf/core/scalar.py | 37 +- python/cudf/cudf/core/series.py | 203 ++++-- python/cudf/cudf/core/single_column_frame.py | 24 +- 
python/cudf/cudf/core/subword_tokenizer.py | 22 +- python/cudf/cudf/core/tools/datetimes.py | 48 +- python/cudf/cudf/core/tools/numeric.py | 10 +- python/cudf/cudf/core/udf/groupby_lowering.py | 46 +- python/cudf/cudf/core/udf/groupby_typing.py | 34 +- python/cudf/cudf/core/udf/groupby_utils.py | 12 +- python/cudf/cudf/core/udf/masked_lowering.py | 38 +- python/cudf/cudf/core/udf/masked_typing.py | 25 +- python/cudf/cudf/core/udf/row_function.py | 10 +- python/cudf/cudf/core/udf/strings_lowering.py | 34 +- python/cudf/cudf/core/udf/strings_typing.py | 8 +- python/cudf/cudf/core/udf/utils.py | 28 +- python/cudf/cudf/core/window/rolling.py | 30 +- python/cudf/cudf/datasets.py | 8 +- python/cudf/cudf/io/avro.py | 6 +- python/cudf/cudf/io/csv.py | 16 +- python/cudf/cudf/io/dlpack.py | 7 +- python/cudf/cudf/io/json.py | 18 +- python/cudf/cudf/io/orc.py | 47 +- python/cudf/cudf/io/parquet.py | 111 +++- python/cudf/cudf/options.py | 16 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 18 +- python/cudf/cudf/pandas/fast_slow_proxy.py | 28 +- python/cudf/cudf/pandas/module_accelerator.py | 40 +- python/cudf/cudf/pandas/profiler.py | 30 +- .../pandas/scripts/analyze-test-failures.py | 6 +- .../pandas/scripts/summarize-test-results.py | 14 +- python/cudf/cudf/testing/_utils.py | 16 +- python/cudf/cudf/testing/dataset_generator.py | 38 +- python/cudf/cudf/testing/testing.py | 62 +- .../tests/indexes/datetime/test_indexing.py | 10 +- .../indexes/datetime/test_time_specific.py | 4 +- .../cudf/cudf/tests/indexes/test_interval.py | 60 +- .../cudf/cudf/tests/input_output/test_text.py | 30 +- .../cudf/tests/series/test_datetimelike.py | 34 +- python/cudf/cudf/tests/test_array_function.py | 8 +- python/cudf/cudf/tests/test_array_ufunc.py | 20 +- .../test_avro_reader_fastavro_integration.py | 32 +- python/cudf/cudf/tests/test_binops.py | 212 +++++-- python/cudf/cudf/tests/test_categorical.py | 38 +- python/cudf/cudf/tests/test_column.py | 23 +- .../cudf/cudf/tests/test_column_accessor.py | 8 +- python/cudf/cudf/tests/test_concat.py | 175 ++++-- python/cudf/cudf/tests/test_contains.py | 6 +- python/cudf/cudf/tests/test_copying.py | 20 +- python/cudf/cudf/tests/test_csv.py | 136 +++- python/cudf/cudf/tests/test_cuda_apply.py | 7 +- .../cudf/tests/test_cuda_array_interface.py | 11 +- python/cudf/cudf/tests/test_cut.py | 12 +- python/cudf/cudf/tests/test_dask.py | 6 +- python/cudf/cudf/tests/test_dataframe.py | 551 +++++++++++----- python/cudf/cudf/tests/test_dataframe_copy.py | 46 +- python/cudf/cudf/tests/test_datasets.py | 4 +- python/cudf/cudf/tests/test_datetime.py | 58 +- python/cudf/cudf/tests/test_decimal.py | 24 +- python/cudf/cudf/tests/test_doctests.py | 10 +- python/cudf/cudf/tests/test_dropna.py | 8 +- python/cudf/cudf/tests/test_dtypes.py | 37 +- python/cudf/cudf/tests/test_duplicates.py | 16 +- python/cudf/cudf/tests/test_feather.py | 7 +- python/cudf/cudf/tests/test_groupby.py | 257 ++++++-- python/cudf/cudf/tests/test_hash_vocab.py | 6 +- python/cudf/cudf/tests/test_hdf.py | 12 +- python/cudf/cudf/tests/test_hdfs.py | 30 +- python/cudf/cudf/tests/test_index.py | 105 +++- python/cudf/cudf/tests/test_indexing.py | 124 +++- python/cudf/cudf/tests/test_interpolate.py | 4 +- python/cudf/cudf/tests/test_interval.py | 4 +- python/cudf/cudf/tests/test_join_order.py | 7 +- python/cudf/cudf/tests/test_joining.py | 144 +++-- python/cudf/cudf/tests/test_json.py | 103 ++- python/cudf/cudf/tests/test_list.py | 30 +- python/cudf/cudf/tests/test_monotonic.py | 20 +- python/cudf/cudf/tests/test_multiindex.py | 134 
++-- python/cudf/cudf/tests/test_numerical.py | 36 +- python/cudf/cudf/tests/test_numpy_interop.py | 10 +- python/cudf/cudf/tests/test_onehot.py | 24 +- python/cudf/cudf/tests/test_options.py | 29 +- python/cudf/cudf/tests/test_orc.py | 113 +++- python/cudf/cudf/tests/test_pack.py | 40 +- python/cudf/cudf/tests/test_parquet.py | 129 +++- python/cudf/cudf/tests/test_pickling.py | 4 +- python/cudf/cudf/tests/test_query.py | 6 +- python/cudf/cudf/tests/test_rank.py | 8 +- python/cudf/cudf/tests/test_reductions.py | 8 +- python/cudf/cudf/tests/test_replace.py | 83 ++- python/cudf/cudf/tests/test_repr.py | 80 ++- python/cudf/cudf/tests/test_resampling.py | 8 +- python/cudf/cudf/tests/test_reshape.py | 104 ++- python/cudf/cudf/tests/test_rolling.py | 56 +- python/cudf/cudf/tests/test_s3.py | 28 +- python/cudf/cudf/tests/test_scalar.py | 18 +- python/cudf/cudf/tests/test_search.py | 4 +- python/cudf/cudf/tests/test_serialize.py | 20 +- python/cudf/cudf/tests/test_series.py | 129 +++- python/cudf/cudf/tests/test_seriesmap.py | 6 +- python/cudf/cudf/tests/test_setitem.py | 36 +- python/cudf/cudf/tests/test_sorting.py | 60 +- python/cudf/cudf/tests/test_spilling.py | 34 +- python/cudf/cudf/tests/test_stats.py | 42 +- python/cudf/cudf/tests/test_string.py | 107 +++- python/cudf/cudf/tests/test_string_udfs.py | 4 +- python/cudf/cudf/tests/test_struct.py | 14 +- python/cudf/cudf/tests/test_testing.py | 21 +- python/cudf/cudf/tests/test_timedelta.py | 40 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 31 +- python/cudf/cudf/tests/test_unaops.py | 5 +- .../cudf/tests/text/test_subword_tokenizer.py | 21 +- .../cudf/cudf/tests/text/test_text_methods.py | 22 +- python/cudf/cudf/utils/_numba.py | 20 +- python/cudf/cudf/utils/_ptxcompiler.py | 10 +- python/cudf/cudf/utils/applyutils.py | 51 +- python/cudf/cudf/utils/cudautils.py | 10 +- python/cudf/cudf/utils/dtypes.py | 96 ++- python/cudf/cudf/utils/gpu_utils.py | 8 +- python/cudf/cudf/utils/hash_vocab_utils.py | 27 +- python/cudf/cudf/utils/ioutils.py | 32 +- python/cudf/cudf/utils/nvtx_annotation.py | 6 +- python/cudf/cudf/utils/queryutils.py | 16 +- python/cudf/cudf/utils/utils.py | 11 +- .../cudf_pandas_tests/test_cudf_pandas.py | 82 ++- .../cudf_pandas_tests/test_fast_slow_proxy.py | 14 +- .../cudf/cudf_pandas_tests/test_profiler.py | 8 +- python/cudf_kafka/cudf_kafka/_version.py | 7 +- python/custreamz/custreamz/_version.py | 7 +- python/custreamz/custreamz/kafka.py | 5 +- python/custreamz/custreamz/tests/conftest.py | 4 +- .../custreamz/tests/test_dataframes.py | 64 +- python/dask_cudf/dask_cudf/_version.py | 7 +- python/dask_cudf/dask_cudf/backends.py | 51 +- python/dask_cudf/dask_cudf/core.py | 27 +- .../dask_cudf/dask_cudf/expr/_collection.py | 4 +- python/dask_cudf/dask_cudf/groupby.py | 32 +- python/dask_cudf/dask_cudf/io/orc.py | 22 +- python/dask_cudf/dask_cudf/io/parquet.py | 15 +- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 22 +- .../dask_cudf/io/tests/test_parquet.py | 56 +- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 4 +- python/dask_cudf/dask_cudf/sorting.py | 15 +- .../dask_cudf/tests/test_accessor.py | 29 +- python/dask_cudf/dask_cudf/tests/test_core.py | 46 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 89 ++- python/dask_cudf/dask_cudf/tests/test_join.py | 34 +- .../dask_cudf/tests/test_reductions.py | 4 +- python/dask_cudf/dask_cudf/tests/test_sort.py | 12 +- 212 files changed, 7030 insertions(+), 2522 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b67284dd33..c0f6e328b4b 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -48,7 +48,7 @@ exclude = [ # TODO: Remove this in a follow-up where we fix __all__. "__init__.py", ] -line-length = 88 +line-length = 79 [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 9b51d5bd5f2..59d73015962 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -30,7 +30,9 @@ def bench_eval_func(benchmark, expr, dataframe): [2, 3, 4], ) def bench_merge(benchmark, dataframe, num_key_cols): - benchmark(dataframe.merge, dataframe, on=list(dataframe.columns[:num_key_cols])) + benchmark( + dataframe.merge, dataframe, on=list(dataframe.columns[:num_key_cols]) + ) # TODO: Some of these cases could be generalized to an IndexedFrame benchmark @@ -65,7 +67,9 @@ def random_state(request): def bench_sample(benchmark, dataframe, axis, frac, random_state): if axis == 1 and isinstance(random_state, cupy.random.RandomState): pytest.skip("Unsupported params.") - benchmark(dataframe.sample, frac=frac, axis=axis, random_state=random_state) + benchmark( + dataframe.sample, frac=frac, axis=axis, random_state=random_state + ) @benchmark_with_object(cls="dataframe", dtype="int") @@ -117,7 +121,10 @@ def bench_groupby(benchmark, dataframe, num_key_cols): [ "sum", ["sum", "mean"], - {f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"] for i in range(6)}, + { + f"{string.ascii_lowercase[i]}": ["sum", "mean", "count"] + for i in range(6) + }, ], ) @pytest.mark.parametrize( @@ -147,7 +154,9 @@ def bench_groupby_sample( kwargs = {"frac": target_sample_frac, "replace": replace} else: minsize = grouper.size().min() - target_size = numpy.round(target_sample_frac * minsize, decimals=0).astype(int) + target_size = numpy.round( + target_sample_frac * minsize, decimals=0 + ).astype(int) kwargs = {"n": target_size, "replace": replace} benchmark(grouper.sample, **kwargs) @@ -156,7 +165,9 @@ def bench_groupby_sample( @benchmark_with_object(cls="dataframe", dtype="int") @pytest.mark.parametrize("num_cols_to_sort", [1]) def bench_sort_values(benchmark, dataframe, num_cols_to_sort): - benchmark(dataframe.sort_values, list(dataframe.columns[:num_cols_to_sort])) + benchmark( + dataframe.sort_values, list(dataframe.columns[:num_cols_to_sort]) + ) @benchmark_with_object(cls="dataframe", dtype="int") diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index 9ab1a55ff33..93109838900 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -9,7 +9,9 @@ from utils import benchmark_with_object -@pytest_cases.parametrize_with_cases("objs", prefix="concat", cases="cases_functions") +@pytest_cases.parametrize_with_cases( + "objs", prefix="concat", cases="cases_functions" +) @pytest.mark.parametrize( "axis", [ @@ -19,7 +21,9 @@ @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("ignore_index", [True, False]) def bench_concat_axis_1(benchmark, objs, axis, join, ignore_index): - benchmark(cudf.concat, objs=objs, axis=axis, join=join, ignore_index=ignore_index) + benchmark( + cudf.concat, objs=objs, axis=axis, join=join, ignore_index=ignore_index + ) @pytest.mark.parametrize("size", [10_000, 100_000]) @@ -47,7 +51,9 @@ def bench_get_dummies_simple(benchmark, prefix): "col3": cudf.Series(list(range(100, 110)), dtype="category"), } ) - benchmark(cudf.get_dummies, df, 
columns=["col1", "col2", "col3"], prefix=prefix) + benchmark( + cudf.get_dummies, df, columns=["col1", "col2", "col3"], prefix=prefix + ) @benchmark_with_object(cls="dataframe", dtype="int", cols=6) diff --git a/python/cudf/benchmarks/API/bench_multiindex.py b/python/cudf/benchmarks/API/bench_multiindex.py index 6d4d6ec0942..6268bcc4267 100644 --- a/python/cudf/benchmarks/API/bench_multiindex.py +++ b/python/cudf/benchmarks/API/bench_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. """Benchmarks of MultiIndex methods.""" @@ -31,7 +31,9 @@ def bench_from_pandas(benchmark, pidx): def bench_constructor(benchmark, midx): - benchmark(cudf.MultiIndex, codes=midx.codes, levels=midx.levels, names=midx.names) + benchmark( + cudf.MultiIndex, codes=midx.codes, levels=midx.levels, names=midx.names + ) def bench_from_frame(benchmark, midx): diff --git a/python/cudf/benchmarks/API/cases_functions.py b/python/cudf/benchmarks/API/cases_functions.py index 627447fbc12..6bc66aa4a9b 100644 --- a/python/cudf/benchmarks/API/cases_functions.py +++ b/python/cudf/benchmarks/API/cases_functions.py @@ -28,7 +28,9 @@ def concat_case_contiguous_indexes(nr): @pytest_cases.parametrize("nr", NUM_ROWS) def concat_case_contiguous_indexes_different_cols(nr): return [ - cudf.DataFrame({"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}), + cudf.DataFrame( + {"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)} + ), cudf.DataFrame( {"c": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), @@ -115,22 +117,30 @@ def concat_case_unique_columns(nr): @pytest_cases.parametrize("nr", NUM_ROWS) def concat_case_unique_columns_with_different_range_index(nr): return [ - cudf.DataFrame({"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)}), + cudf.DataFrame( + {"a": cupy.tile([1, 2, 3], nr), "b": cupy.tile([4, 5, 7], nr)} + ), cudf.DataFrame( {"c": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), ), - cudf.DataFrame({"d": cupy.tile([1, 2, 3], nr), "e": cupy.tile([4, 5, 7], nr)}), + cudf.DataFrame( + {"d": cupy.tile([1, 2, 3], nr), "e": cupy.tile([4, 5, 7], nr)} + ), cudf.DataFrame( {"f": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), ), - cudf.DataFrame({"g": cupy.tile([1, 2, 3], nr), "h": cupy.tile([4, 5, 7], nr)}), + cudf.DataFrame( + {"g": cupy.tile([1, 2, 3], nr), "h": cupy.tile([4, 5, 7], nr)} + ), cudf.DataFrame( {"i": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), ), - cudf.DataFrame({"j": cupy.tile([1, 2, 3], nr), "k": cupy.tile([4, 5, 7], nr)}), + cudf.DataFrame( + {"j": cupy.tile([1, 2, 3], nr), "k": cupy.tile([4, 5, 7], nr)} + ), cudf.DataFrame( {"l": cupy.tile([4, 5, 7], nr)}, index=cudf.RangeIndex(start=nr * 3, stop=nr * 2 * 3), diff --git a/python/cudf/benchmarks/common/utils.py b/python/cudf/benchmarks/common/utils.py index 1b79882ca17..363316f0930 100644 --- a/python/cudf/benchmarks/common/utils.py +++ b/python/cudf/benchmarks/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. 
"""Common utilities for fixture creation and benchmarking.""" @@ -42,7 +42,9 @@ def make_boolean_mask_column(size): return cudf.core.column.as_column(rstate.randint(0, 2, size).astype(bool)) -def benchmark_with_object(cls, *, dtype="int", nulls=None, cols=None, rows=None): +def benchmark_with_object( + cls, *, dtype="int", nulls=None, cols=None, rows=None +): """Pass "standard" cudf fixtures to functions without renaming parameters. The fixture generation logic in conftest.py provides a plethora of useful diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 4c568f9bdc8..7b2b71cf216 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -93,7 +93,10 @@ def make_dataframe(nr, nc, column_generator=column_generator): string.ascii_lowercase ), "make_dataframe only supports a maximum of 26 columns" return cudf.DataFrame( - {f"{string.ascii_lowercase[i]}": column_generator(nr) for i in range(nc)} + { + f"{string.ascii_lowercase[i]}": column_generator(nr) + for i in range(nc) + } ) for nr in NUM_ROWS: @@ -105,7 +108,9 @@ def make_dataframe(nr, nc, column_generator=column_generator): # https://github.com/smarie/python-pytest-cases/issues/278 # Once that is fixed we could remove all the extraneous `request` # fixtures in these fixtures. - def series_nulls_false(request, nr=nr, column_generator=column_generator): + def series_nulls_false( + request, nr=nr, column_generator=column_generator + ): return cudf.Series(column_generator(nr)) make_fixture( @@ -115,7 +120,9 @@ def series_nulls_false(request, nr=nr, column_generator=column_generator): fixtures, ) - def series_nulls_true(request, nr=nr, column_generator=column_generator): + def series_nulls_true( + request, nr=nr, column_generator=column_generator + ): s = cudf.Series(column_generator(nr)) s.iloc[::2] = None return s @@ -128,7 +135,9 @@ def series_nulls_true(request, nr=nr, column_generator=column_generator): ) # For now, not bothering to include a nullable index fixture. 
- def index_nulls_false(request, nr=nr, column_generator=column_generator): + def index_nulls_false( + request, nr=nr, column_generator=column_generator + ): return cudf.Index(column_generator(nr)) make_fixture( diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index cacd5574b57..8da769b7858 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -31,7 +31,9 @@ def bench_unique_single_column(benchmark, column): @pytest.mark.parametrize("nullify", [True, False]) @pytest.mark.parametrize("gather_how", ["sequence", "reverse", "random"]) def bench_take(benchmark, column, gather_how, nullify): - gather_map = make_gather_map(column.size * 0.4, column.size, gather_how)._column + gather_map = make_gather_map( + column.size * 0.4, column.size, gather_how + )._column benchmark(column.take, gather_map, nullify=nullify) @@ -105,6 +107,8 @@ def setitem_case_int_column_align_to_col_size(column): # column (len(val) != len(key) and len == num_true) -@pytest_cases.parametrize_with_cases("column,key,value", cases=".", prefix="setitem") +@pytest_cases.parametrize_with_cases( + "column,key,value", cases=".", prefix="setitem" +) def bench_setitem(benchmark, column, key, value): benchmark(column.__setitem__, key, value) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index ed647e45528..d9974037daa 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import copy import io @@ -69,14 +69,17 @@ def generate_input(self): - cudf.utils.dtypes.TIMEDELTA_TYPES ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 54bac6c5f26..5b49143fd5a 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -53,13 +53,16 @@ def generate_input(self): seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -81,12 +84,18 @@ def set_rand_params(self, params): col_val = np.random.choice( [ None, - np.unique(np.random.choice(self._df.columns, col_size)), + np.unique( + np.random.choice(self._df.columns, 
col_size) + ), ] ) - params_dict[param] = col_val if col_val is None else list(col_val) + params_dict[param] = ( + col_val if col_val is None else list(col_val) + ) elif param == "dtype": - dtype_val = np.random.choice([None, self._df.dtypes.to_dict()]) + dtype_val = np.random.choice( + [None, self._df.dtypes.to_dict()] + ) if dtype_val is not None: dtype_val = { col_name: "category" @@ -101,9 +110,13 @@ def set_rand_params(self, params): ) params_dict[param] = header_val elif param == "skiprows": - params_dict[param] = np.random.randint(low=0, high=len(self._df)) + params_dict[param] = np.random.randint( + low=0, high=len(self._df) + ) elif param == "skipfooter": - params_dict[param] = np.random.randint(low=0, high=len(self._df)) + params_dict[param] = np.random.randint( + low=0, high=len(self._df) + ) elif param == "nrows": nrows_val = np.random.choice( [None, np.random.randint(low=0, high=len(self._df))] @@ -145,13 +158,16 @@ def generate_input(self): seed = random.randint(0, 2**32 - 1) random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -172,7 +188,9 @@ def set_rand_params(self, params): col_size = self._rand(len(self._current_buffer.columns)) params_dict[param] = list( np.unique( - np.random.choice(self._current_buffer.columns, col_size) + np.random.choice( + self._current_buffer.columns, col_size + ) ) ) elif param == "chunksize": diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index ccea536df00..ee1b2c1f1c4 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -71,7 +71,9 @@ def write_crash(self, error): crash_log_path = error_file_name + "_crash.log" with open(crash_path, "w") as f: - json.dump(self._data_handler.current_params, f, sort_keys=True, indent=4) + json.dump( + self._data_handler.current_params, f, sort_keys=True, indent=4 + ) logging.info(f"Crash params was written to {crash_path}") diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index e757e2b602b..ffb7171a855 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import copy import json @@ -49,7 +49,9 @@ def __init__( else: for i in os.listdir(path): file_name = os.path.join(path, i) - if os.path.isfile(file_name) and file_name.endswith("_crash.json"): + if os.path.isfile(file_name) and file_name.endswith( + "_crash.json" + ): self._load_params(file_name) self._regression = bool(self._inputs) self._idx = 0 @@ -74,7 +76,9 @@ def current_params(self): def get_next_regression_params(self): if self._idx >= len(self._inputs): - logging.info("Reached the end of all crash.json files to run..Exiting..") + logging.info( + "Reached the end of all crash.json files to run..Exiting.." 
+ ) sys.exit(0) param = self._inputs[self._idx] dtypes_meta = param["dtypes_meta"] diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index 800c4baa851..bffd508b2ef 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -79,13 +79,16 @@ def generate_input(self): # issue is fixed: # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -151,13 +154,16 @@ def generate_input(self): # issue is fixed: # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py index beb68ab2e43..54e49b63e41 100644 --- a/python/cudf/cudf/_fuzz_testing/main.py +++ b/python/cudf/cudf/_fuzz_testing/main.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from cudf._fuzz_testing import fuzzer @@ -20,7 +20,9 @@ def __init__(self, func, params=None, data_handle=None, **kwargs): params=params, write_data_on_failure=kwargs.get("write_data_on_failure", True), max_lists_length=kwargs.get("max_lists_length", None), - max_lists_nesting_depth=kwargs.get("max_lists_nesting_depth", None), + max_lists_nesting_depth=kwargs.get( + "max_lists_nesting_depth", None + ), ) def __call__(self, *args, **kwargs): diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 8136c022be5..ecddc72fa85 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import copy import io @@ -63,7 +63,9 @@ def generate_input(self): - {"datetime64[ns]"} ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) @@ -71,7 +73,8 @@ def generate_input(self): self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -109,7 +112,9 @@ def set_rand_params(self, params): map( int, np.unique( - np.random.choice(stripes, orcFile.nstripes) + np.random.choice( + stripes, orcFile.nstripes + ) ), ) ), @@ -173,14 +178,17 @@ def generate_input(self): - cudf.utils.dtypes.DATETIME_TYPES ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index caf2e2b7a93..2d934e4816d 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import logging import random @@ -60,14 +60,17 @@ def generate_input(self): | {"list", "decimal64"} ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) df = pyarrow_to_pandas(table) @@ -142,13 +145,16 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta(self, dtypes_list) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( + self, dtypes_list + ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_columns"] = num_cols logging.info( - f"Generating DataFrame with rows: {num_rows} " f"and columns: {num_cols}" + f"Generating DataFrame with rows: {num_rows} " + f"and columns: {num_cols}" ) table = dg.rand_dataframe(dtypes_meta, num_rows, seed) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py index a34b3051608..2f5e6204f7c 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. import io import sys @@ -44,7 +44,9 @@ def json_writer_test(pdf): # https://github.com/rapidsai/cudf/issues/6429 # compare_content(pdf_buffer, gdf_buffer) - actual = cudf.read_json(gdf_buffer, engine="cudf", lines=True, orient="records") + actual = cudf.read_json( + gdf_buffer, engine="cudf", lines=True, orient="records" + ) expected = pd.read_json(pdf_buffer, lines=True, orient="records") expected.columns = expected.columns.astype("str") assert_eq(actual, expected) @@ -60,8 +62,12 @@ def json_writer_test(pdf): def json_writer_test_params(pdf, compression, dtype): gdf = cudf.from_pandas(pdf) - pdf_buffer = pdf.to_json(lines=True, orient="records", compression=compression) - gdf_buffer = gdf.to_json(lines=True, orient="records", compression=compression) + pdf_buffer = pdf.to_json( + lines=True, orient="records", compression=compression + ) + gdf_buffer = gdf.to_json( + lines=True, orient="records", compression=compression + ) # TODO: Uncomment once this is fixed: # https://github.com/rapidsai/cudf/issues/6429 diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index d67d3989b2d..977038d1fcb 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import io import sys @@ -54,14 +54,18 @@ def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): ) def orc_reader_stripes_test(input_tuple, columns, stripes): _, file_buffer = input_tuple - expected_pdf = orc_to_pandas(file_io_obj=io.BytesIO(file_buffer), stripes=stripes) + expected_pdf = orc_to_pandas( + file_io_obj=io.BytesIO(file_buffer), stripes=stripes + ) if columns is not None and len(columns) > 0: # ORC reader picks columns if only # there are any elements in `columns` expected_pdf = expected_pdf[columns] - gdf = cudf.read_orc(io.BytesIO(file_buffer), columns=columns, stripes=stripes) + gdf = cudf.read_orc( + io.BytesIO(file_buffer), columns=columns, stripes=stripes + ) compare_dataframe(expected_pdf, gdf) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index 115c1b67518..3d070576a12 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import sys @@ -72,7 +72,9 @@ def parquet_writer_test(pdf): "compression": ["snappy", None], }, ) -def parquet_writer_test_rowgroup_index_compression(pdf, compression, row_group_size): +def parquet_writer_test_rowgroup_index_compression( + pdf, compression, row_group_size +): pd_file_name = "cpu_pdf.parquet" gd_file_name = "gpu_pdf.parquet" diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 4378989d6e3..6e53195ac2d 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -74,7 +74,9 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["lists_max_length"] = obj._max_lists_length if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint(1, np.iinfo("int64").max) + meta["nesting_max_depth"] = np.random.randint( + 1, np.iinfo("int64").max + ) else: meta["nesting_max_depth"] = obj._max_lists_nesting_depth @@ -93,9 +95,13 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_null_frequency"] = obj._max_struct_null_frequency if obj._max_struct_types_at_each_level is None: - meta["max_types_at_each_level"] = np.random.randint(low=1, high=10) + meta["max_types_at_each_level"] = np.random.randint( + low=1, high=10 + ) else: - meta["max_types_at_each_level"] = obj._max_struct_types_at_each_level + meta[ + "max_types_at_each_level" + ] = obj._max_struct_types_at_each_level elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION @@ -117,7 +123,9 @@ def run_test(funcs, args): try: funcs[function_name_to_run]() except KeyError: - print(f"Provided function name({function_name_to_run}) does not exist.") + print( + f"Provided function name({function_name_to_run}) does not exist." 
+ ) def pyarrow_to_pandas(table): diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index bab9514d622..206173919e1 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -24,7 +24,9 @@ DtypeObj = Union["ExtensionDtype", np.dtype] # scalars -DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", Period, Timestamp, Timedelta) +DatetimeLikeScalar = TypeVar( + "DatetimeLikeScalar", Period, Timestamp, Timedelta +) ScalarLike = Any # columns @@ -39,4 +41,6 @@ # Groupby aggregation AggType = Union[str, Callable] -MultiColumnAggType = Union[AggType, Iterable[AggType], Dict[Any, Iterable[AggType]]] +MultiColumnAggType = Union[ + AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +] diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py index 8e4105d9a2e..ecf6ddd8e3b 100644 --- a/python/cudf/cudf/_version.py +++ b/python/cudf/cudf/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,5 +14,7 @@ import importlib.resources -__version__ = importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() +__version__ = ( + importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() +) __git_commit__ = "" diff --git a/python/cudf/cudf/api/extensions/accessor.py b/python/cudf/cudf/api/extensions/accessor.py index 73e0594dd1c..e4988c1fa68 100644 --- a/python/cudf/cudf/api/extensions/accessor.py +++ b/python/cudf/cudf/api/extensions/accessor.py @@ -117,7 +117,9 @@ ) doc_register_index_accessor = docfmt_partial( - docstring=_docstring_register_accessor.format(klass="Index", example=_index_example) + docstring=_docstring_register_accessor.format( + klass="Index", example=_index_example + ) ) doc_register_series_accessor = docfmt_partial( diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 86fa4f24c4f..417d8b0922a 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -108,7 +108,10 @@ def is_string_dtype(obj): Whether or not the array or dtype is of the string dtype. """ return ( - (isinstance(obj, (cudf.Index, cudf.Series)) and obj.dtype == cudf.dtype("O")) + ( + isinstance(obj, (cudf.Index, cudf.Series)) + and obj.dtype == cudf.dtype("O") + ) or (isinstance(obj, cudf.core.column.StringColumn)) or ( pd.api.types.is_string_dtype(obj) @@ -171,9 +174,9 @@ def _is_scalar_or_zero_d_array(val): bool Return True if given object is scalar. 
""" - return (isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0) or is_scalar( - val - ) + return ( + isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0 + ) or is_scalar(val) # TODO: We should be able to reuse the pandas function for this, need to figure @@ -239,7 +242,9 @@ def _union_categoricals( ) if sort_categories: sorted_categories = result_col.categories.sort_values(ascending=True) - result_col = result_col.reorder_categories(new_categories=sorted_categories) + result_col = result_col.reorder_categories( + new_categories=sorted_categories + ) return cudf.Index(result_col) @@ -288,7 +293,9 @@ def is_bool_dtype(arr_or_dtype): else: return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype.dtype) elif isinstance(arr_or_dtype, cudf.CategoricalDtype): - return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype.categories.dtype) + return pd_types.is_bool_dtype( + arr_or_dtype=arr_or_dtype.categories.dtype + ) else: return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype) @@ -499,7 +506,9 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: elif isinstance(dtype_to_check, pd.CategoricalDtype): if dtype_to_check.categories is None: return False - return _is_pandas_nullable_extension_dtype(dtype_to_check.categories.dtype) + return _is_pandas_nullable_extension_dtype( + dtype_to_check.categories.dtype + ) elif isinstance(dtype_to_check, pd.IntervalDtype): return _is_pandas_nullable_extension_dtype(dtype_to_check.subtype) return False @@ -516,14 +525,18 @@ def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: is_datetime_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) is_datetime64_any_dtype = pd_types.is_datetime64_any_dtype is_datetime64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) -is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_ns_dtype) +is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api( + pd_types.is_datetime64_ns_dtype +) is_extension_array_dtype = pd_types.is_extension_array_dtype is_int64_dtype = pd_types.is_int64_dtype is_period_dtype = pd_types.is_period_dtype is_signed_integer_dtype = pd_types.is_signed_integer_dtype is_timedelta_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) is_timedelta64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) -is_timedelta64_ns_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_ns_dtype) +is_timedelta64_ns_dtype = _wrap_pandas_is_dtype_api( + pd_types.is_timedelta64_ns_dtype +) is_unsigned_integer_dtype = pd_types.is_unsigned_integer_dtype is_sparse = pd_types.is_sparse # is_list_like = pd_types.is_list_like diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 59489700f8f..de44f392eef 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -277,7 +277,9 @@ def __getitem__(self, key): def __contains__(self, item): return item in self._values - def _copy_type_metadata(self, other: Self, *, override_dtypes=None) -> Self: + def _copy_type_metadata( + self, other: Self, *, override_dtypes=None + ) -> Self: raise NotImplementedError def get_level_values(self, level): @@ -346,7 +348,9 @@ def names(self, values): num_values = len(values) if num_values > 1: - raise ValueError("Length of new names must be 1, got %d" % num_values) + raise ValueError( + "Length of new names must be 1, got %d" % num_values + ) self.name = values[0] @@ -601,7 +605,9 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if 
(is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype)) or ( + if ( + is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) + ) or ( not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) ): # Bools + other types will result in mixed type. @@ -619,13 +625,17 @@ def union(self, other, sort=None): raise MixedTypeError("Cannot perform union with mixed types") if not len(other) or self.equals(other): - common_dtype = cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) + common_dtype = cudf.utils.dtypes.find_common_type( + [self.dtype, other.dtype] + ) res = self._get_reconciled_name_object(other).astype(common_dtype) if sort: return res.sort_values() return res elif not len(self): - common_dtype = cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) + common_dtype = cudf.utils.dtypes.find_common_type( + [self.dtype, other.dtype] + ) res = other._get_reconciled_name_object(self).astype(common_dtype) if sort: return res.sort_values() @@ -785,7 +795,9 @@ def fillna(self, value, downcast=None): Index([1, 2, 3, 4], dtype='int64') """ if downcast is not None: - raise NotImplementedError("`downcast` parameter is not yet supported") + raise NotImplementedError( + "`downcast` parameter is not yet supported" + ) return super().fillna(value=value) @@ -843,7 +855,9 @@ def to_frame(self, index=True, name=no_default): else: col_name = name - return cudf.DataFrame({col_name: self._values}, index=self if index else None) + return cudf.DataFrame( + {col_name: self._values}, index=self if index else None + ) def to_arrow(self): """Convert to a suitable Arrow object.""" @@ -1475,7 +1489,9 @@ def _union(self, other, sort=None): self_df["order"] = self_df.index other_df["order"] = other_df.index res = self_df.merge(other_df, on=[0], how="outer") - res = res.sort_values(by=res._data.to_pandas_index()[1:], ignore_index=True) + res = res.sort_values( + by=res._data.to_pandas_index()[1:], ignore_index=True + ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) if sort in {None, True} and len(other): @@ -1593,7 +1609,9 @@ def sort_values( else: return index_sorted - def join(self, other, how="left", level=None, return_indexers=False, sort=False): + def join( + self, other, how="left", level=None, return_indexers=False, sort=False + ): """ Compute join_index and indexers to conform data structures to the new index. 
@@ -1768,8 +1786,16 @@ def find_label_range(self, loc: slice) -> slice: start_side, stop_side = "right", "left" else: start_side, stop_side = "left", "right" - istart = None if start is None else self.get_slice_bound(start, side=start_side) - istop = None if stop is None else self.get_slice_bound(stop, side=stop_side) + istart = ( + None + if start is None + else self.get_slice_bound(start, side=start_side) + ) + istop = ( + None + if stop is None + else self.get_slice_bound(stop, side=stop_side) + ) if step < 0: # Fencepost istart = None if istart is None else max(istart - 1, 0) @@ -1840,7 +1866,9 @@ def get_slice_bound( except ValueError: raise KeyError(f"{label=} not in index") if left != right: - raise KeyError(f"Cannot get slice bound for non-unique label {label=}") + raise KeyError( + f"Cannot get slice bound for non-unique label {label=}" + ) if side == "left": return left else: @@ -1862,7 +1890,9 @@ def __array_function__(self, func, types, args, kwargs): # check if we don't handle any of the types (including sub-class) for t in types: - if not any(issubclass(t, handled_type) for handled_type in handled_types): + if not any( + issubclass(t, handled_type) for handled_type in handled_types + ): return NotImplemented if hasattr(cudf_index_module, fname): @@ -1913,7 +1943,9 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default): Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if nan_as_null is no_default: - nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None + nan_as_null = ( + False if cudf.get_option("mode.pandas_compatible") else None + ) if not isinstance(index, pd.Index): raise TypeError("not a pandas.Index") @@ -2061,7 +2093,9 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype(size_type_dtype) - if not _gather_map_is_valid(gather_map, len(self), check_bounds, nullify): + if not _gather_map_is_valid( + gather_map, len(self), check_bounds, nullify + ): raise IndexError("Gather map index is out of bounds.") return self._from_columns_like_self( @@ -2094,9 +2128,13 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): """ if axis not in {0, "index"}: - raise NotImplementedError("Gather along column axis is not yet supported.") + raise NotImplementedError( + "Gather along column axis is not yet supported." + ) if not allow_fill or fill_value is not None: - raise NotImplementedError("`allow_fill` and `fill_value` are unsupported.") + raise NotImplementedError( + "`allow_fill` and `fill_value` are unsupported." + ) return self._gather(indices) diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index b69c456b7ba..5cb9f0363e0 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import ast import functools @@ -208,7 +208,8 @@ def visit_Call(self, node): # Assuming only unary functions are supported, which is checked above. if len(node.args) != 1 or node.keywords: raise ValueError( - f"Function {node.func} only accepts one positional " "argument." + f"Function {node.func} only accepts one positional " + "argument." 
) self.visit(node.args[0]) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 6f260424b5e..4e2fad08d56 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -78,7 +78,9 @@ def _find_and_read_tzfile_tzdata(zone_name): def _read_tzfile_as_frame(tzdir, zone_name): - transition_times_and_offsets = make_timezone_transition_table(tzdir, zone_name) + transition_times_and_offsets = make_timezone_transition_table( + tzdir, zone_name + ) if not transition_times_and_offsets: # this happens for UTC-like zones @@ -89,11 +91,15 @@ def _read_tzfile_as_frame(tzdir, zone_name): ) return DataFrame._from_data( - dict(zip(["transition_times", "offsets"], transition_times_and_offsets)) + dict( + zip(["transition_times", "offsets"], transition_times_and_offsets) + ) ) -def _find_ambiguous_and_nonexistent(data: DatetimeColumn, zone_name: str) -> Tuple: +def _find_ambiguous_and_nonexistent( + data: DatetimeColumn, zone_name: str +) -> Tuple: """ Recognize ambiguous and nonexistent timestamps for the given timezone. @@ -107,7 +113,9 @@ def _find_ambiguous_and_nonexistent(data: DatetimeColumn, zone_name: str) -> Tup """ tz_data_for_zone = get_tz_data(zone_name) transition_times = tz_data_for_zone["transition_times"] - offsets = tz_data_for_zone["offsets"].astype(f"timedelta64[{data._time_unit}]") + offsets = tz_data_for_zone["offsets"].astype( + f"timedelta64[{data._time_unit}]" + ) if len(offsets) == 1: # no transitions return False, False @@ -163,12 +171,17 @@ def localize( data: DatetimeColumn, zone_name: str, ambiguous, nonexistent ) -> DatetimeTZColumn: if ambiguous != "NaT": - raise NotImplementedError("Only ambiguous='NaT' is currently supported") + raise NotImplementedError( + "Only ambiguous='NaT' is currently supported" + ) if nonexistent != "NaT": - raise NotImplementedError("Only nonexistent='NaT' is currently supported") + raise NotImplementedError( + "Only nonexistent='NaT' is currently supported" + ) if isinstance(data, DatetimeTZColumn): raise ValueError( - "Already localized. " "Use `tz_convert` to convert between time zones." + "Already localized. " + "Use `tz_convert` to convert between time zones." ) dtype = pd.DatetimeTZDtype(data._time_unit, zone_name) ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index dd6069cbb4e..ef6b10f66c1 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import warnings from typing import Tuple, Union @@ -81,7 +81,9 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast(other, source_dtype): + if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( + other, source_dtype + ): common_dtype = source_dtype elif ( isinstance(source_col, cudf.core.column.NumericalColumn) @@ -114,7 +116,9 @@ def _make_categorical_like(result, column): if isinstance(column, cudf.core.column.CategoricalColumn): result = cudf.core.column.build_categorical_column( categories=column.categories, - codes=cudf.core.column.build_column(result.base_data, dtype=result.dtype), + codes=cudf.core.column.build_column( + result.base_data, dtype=result.dtype + ), mask=result.base_mask, size=result.size, offset=result.offset, diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index 044b709266c..ce6bb83bc77 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -42,7 +42,9 @@ def serialize(self): :meta private: """ - raise NotImplementedError("Subclasses of Serializable must implement serialize") + raise NotImplementedError( + "Subclasses of Serializable must implement serialize" + ) @classmethod def deserialize(cls, header, frames): @@ -97,7 +99,9 @@ def device_serialize(self): for f in frames ) header["type-serialized"] = pickle.dumps(type(self)) - header["is-cuda"] = [hasattr(f, "__cuda_array_interface__") for f in frames] + header["is-cuda"] = [ + hasattr(f, "__cuda_array_interface__") for f in frames + ] header["lengths"] = [f.nbytes for f in frames] return header, frames diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index deb98d2f39f..33cec21caa5 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -138,7 +138,9 @@ def _index_or_values_interpolation(column, index=None): return column to_interp = IndexedFrame(data={None: column}, index=index) - known_x_and_y = to_interp._apply_boolean_mask(BooleanMask(~mask, len(to_interp))) + known_x_and_y = to_interp._apply_boolean_mask( + BooleanMask(~mask, len(to_interp)) + ) known_x = known_x_and_y._index._column.values known_y = known_x_and_y._data.columns[0].values diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 55780142001..8d278c9c065 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -160,7 +160,9 @@ def _from_device_memory(cls, data: Any, exposed: bool) -> Self: ret._ptr = data.ptr ret._size = data.size else: - ret._ptr, ret._size = get_ptr_and_size(data.__cuda_array_interface__) + ret._ptr, ret._size = get_ptr_and_size( + data.__cuda_array_interface__ + ) if ret.size < 0: raise ValueError("size cannot be negative") return ret @@ -256,7 +258,9 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: """ return self._ptr - def memoryview(self, *, offset: int = 0, size: Optional[int] = None) -> memoryview: + def memoryview( + self, *, offset: int = 0, size: Optional[int] = None + ) -> memoryview: """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) @@ -304,7 +308,9 @@ def __init__( if offset < 0: raise ValueError("offset cannot be negative") if offset + size > owner.size: - raise ValueError("offset+size cannot be greater than the size of owner") + raise ValueError( + "offset+size cannot be 
greater than the size of owner" + ) self._owner = owner self._offset = offset self._size = size diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index cc1eecf827d..3e654e01401 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -21,7 +21,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes -_spill_cudf_nvtx_annotate = partial(_cudf_nvtx_annotate, domain="cudf_python-spill") +_spill_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="cudf_python-spill" +) def get_traceback() -> str: @@ -380,15 +382,21 @@ def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int: int The number of bytes spilled. """ - limit = self._device_memory_limit if device_limit is None else device_limit + limit = ( + self._device_memory_limit if device_limit is None else device_limit + ) if limit is None: return 0 - unspilled = sum(buf.size for buf in self.buffers() if not buf.is_spilled) + unspilled = sum( + buf.size for buf in self.buffers() if not buf.is_spilled + ) return self.spill_device_memory(nbytes=unspilled - limit) def __repr__(self) -> str: spilled = sum(buf.size for buf in self.buffers() if buf.is_spilled) - unspilled = sum(buf.size for buf in self.buffers() if not buf.is_spilled) + unspilled = sum( + buf.size for buf in self.buffers() if not buf.is_spilled + ) unspillable = 0 for buf in self.buffers(): if not (buf.is_spilled or buf.spillable): diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 35c5a3e1abe..b25af13679c 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -111,7 +111,8 @@ def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: manager = get_global_manager() if manager is None: raise ValueError( - f"cannot create {self.__class__} without " "a global spill manager" + f"cannot create {self.__class__} without " + "a global spill manager" ) self._manager = manager @@ -197,7 +198,9 @@ def spill(self, target: str = "cpu") -> None: return if not self.spillable: - raise ValueError(f"Cannot in-place move an unspillable buffer: {self}") + raise ValueError( + f"Cannot in-place move an unspillable buffer: {self}" + ) if (ptr_type, target) == ("gpu", "cpu"): with annotate( @@ -206,7 +209,9 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host(self._ptr, host_mem) + rmm._lib.device_buffer.copy_ptr_to_host( + self._ptr, host_mem + ) self._ptr_desc["memoryview"] = host_mem self._ptr = 0 self._owner = None @@ -338,7 +343,9 @@ def __cuda_array_interface__(self) -> dict: "version": 0, } - def memoryview(self, *, offset: int = 0, size: Optional[int] = None) -> memoryview: + def memoryview( + self, *, offset: int = 0, size: Optional[int] = None + ) -> memoryview: size = self._size if size is None else size with self.lock: if self.spillable: @@ -347,7 +354,9 @@ def memoryview(self, *, offset: int = 0, size: Optional[int] = None) -> memoryvi else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host(self._ptr + offset, ret) + rmm._lib.device_buffer.copy_ptr_to_host( + self._ptr + offset, ret + ) return ret def __str__(self) -> str: diff --git a/python/cudf/cudf/core/buffer/utils.py 
b/python/cudf/cudf/core/buffer/utils.py index 0d4dee7255f..c2ec7effd13 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -106,7 +106,9 @@ def as_buffer( # the Buffer (and its sub-classes) do not have to. if isinstance(data, int): if size is None: - raise ValueError("size must be specified when `data` is an integer") + raise ValueError( + "size must be specified when `data` is an integer" + ) data = cuda_array_interface_wrapper(ptr=data, size=size, owner=owner) elif size is not None or owner is not None: raise ValueError( @@ -149,7 +151,8 @@ def as_buffer( and get_spill_lock() is None ): raise ValueError( - "An owning spillable buffer must " "either be exposed or spill locked." + "An owning spillable buffer must " + "either be exposed or spill locked." ) ptr, size = get_ptr_and_size(data.__cuda_array_interface__) base_ptr = owner.get_ptr(mode="read") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f245cb90d4a..88bb4521a5b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -100,7 +100,9 @@ class CategoricalAccessor(ColumnMethods): def __init__(self, parent: SeriesOrSingleColumnIndex): if not isinstance(parent.dtype, CategoricalDtype): - raise AttributeError("Can only use .cat accessor with a 'category' dtype") + raise AttributeError( + "Can only use .cat accessor with a 'category' dtype" + ) super().__init__(parent=parent) @property @@ -115,7 +117,11 @@ def codes(self) -> "cudf.Series": """ Return Series of codes as well as the index. """ - index = self._parent.index if isinstance(self._parent, cudf.Series) else None + index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else None + ) return cudf.Series(self._column.codes, index=index) @property @@ -261,7 +267,9 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: f"type-cast new_categories to the same type as " f"existing categories." 
) - common_dtype = find_common_type([old_categories.dtype, new_categories.dtype]) + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] + ) new_categories = new_categories.astype(common_dtype) old_categories = old_categories.astype(common_dtype) @@ -549,7 +557,9 @@ def __init__( @property def base_size(self) -> int: - return int((self.base_children[0].size) / self.base_children[0].dtype.itemsize) + return int( + (self.base_children[0].size) / self.base_children[0].dtype.itemsize + ) def __contains__(self, item: ScalarLike) -> bool: try: @@ -607,7 +617,9 @@ def categories(self) -> ColumnBase: @categories.setter def categories(self, value): - self._dtype = CategoricalDtype(categories=value, ordered=self.dtype.ordered) + self._dtype = CategoricalDtype( + categories=value, ordered=self.dtype.ordered + ) @property def codes(self) -> NumericalColumn: @@ -624,9 +636,9 @@ def ordered(self, value: bool): self.dtype.ordered = value def __setitem__(self, key, value): - if cudf.api.types.is_scalar(value) and cudf._lib.scalar._is_null_host_scalar( + if cudf.api.types.is_scalar( value - ): + ) and cudf._lib.scalar._is_null_host_scalar(value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): @@ -671,20 +683,26 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = cudf._lib.scalar.as_device_scalar(fill_code, self.codes.dtype) + fill_scalar = cudf._lib.scalar.as_device_scalar( + fill_code, self.codes.dtype + ) result = self if inplace else self.copy() libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def slice(self, start: int, stop: int, stride: Optional[int] = None) -> Self: + def slice( + self, start: int, stop: int, stride: Optional[int] = None + ) -> Self: codes = self.codes.slice(start, stop, stride) return cast( Self, cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.build_column(codes.base_data, dtype=codes.dtype), + codes=cudf.core.column.build_column( + codes.base_data, dtype=codes.dtype + ), mask=codes.base_mask, ordered=self.ordered, size=codes.size, @@ -711,7 +729,9 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if not isinstance(other, CategoricalColumn): return NotImplemented if other.dtype != self.dtype: - raise TypeError("Categoricals can only compare with the same type") + raise TypeError( + "Categoricals can only compare with the same type" + ) return other ary = column.as_column( @@ -797,7 +817,9 @@ def to_arrow(self) -> pa.Array: """Convert to PyArrow Array.""" # arrow doesn't support unsigned codes signed_type = ( - min_signed_type(self.codes.max()) if self.codes.size > 0 else np.int8 + min_signed_type(self.codes.max()) + if self.codes.size > 0 + else np.int8 ) codes = self.codes.astype(signed_type) categories = self.categories @@ -826,9 +848,13 @@ def values(self): raise NotImplementedError("cudf.Categorical is not yet implemented") def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": - return self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + return ( + self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + ) - def data_array_view(self, *, mode="write") -> cuda.devicearray.DeviceNDArray: + def data_array_view( + self, *, mode="write" + ) -> cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: @@ -872,7 +898,9 @@ def find_and_replace( f"got to_replace dtype: 
{to_replace_col.dtype} and " f"value dtype: {replacement_col.dtype}" ) - df = cudf.DataFrame._from_data({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: fill_value = ( @@ -889,7 +917,9 @@ def find_and_replace( if fill_value in self.categories: # type: ignore replaced = self.fillna(fill_value) else: - new_categories = self.categories.append(column.as_column([fill_value])) + new_categories = self.categories.append( + column.as_column([fill_value]) + ) replaced = self._set_categories(new_categories) replaced = replaced.fillna(fill_value) df = df.dropna(subset=["old"]) @@ -898,7 +928,9 @@ def find_and_replace( else: replaced = self if df._data["new"].null_count > 0: - drop_values = df._data["old"].apply_boolean_mask(df._data["new"].isnull()) + drop_values = df._data["old"].apply_boolean_mask( + df._data["new"].isnull() + ) cur_categories = replaced.categories new_categories = cur_categories.apply_boolean_mask( ~cudf.Series(cur_categories.isin(drop_values)) @@ -928,7 +960,9 @@ def find_and_replace( # map it to the new label it is to be replaced by dtype_replace = cudf.Series._from_data({None: replacement_col}) dtype_replace[dtype_replace.isin(cats_col)] = None - new_cats_col = cats_col.find_and_replace(to_replace_col, dtype_replace._column) + new_cats_col = cats_col.find_and_replace( + to_replace_col, dtype_replace._column + ) # anything we mapped to None, we want to now filter out since # those categories don't exist anymore @@ -953,11 +987,15 @@ def find_and_replace( # The index of this frame is now the old ints, but the column # named 'index', which came from the filtered categories, # contains the new ints that we need to map to - to_replace_col = column.as_column(catmap.index).astype(replaced.codes.dtype) + to_replace_col = column.as_column(catmap.index).astype( + replaced.codes.dtype + ) replacement_col = catmap._data["index"].astype(replaced.codes.dtype) replaced = column.as_column(replaced.codes) - output = libcudf.replace.replace(replaced, to_replace_col, replacement_col) + output = libcudf.replace.replace( + replaced, to_replace_col, replacement_col + ) result = column.build_categorical_column( categories=new_cats["cats"], @@ -1048,11 +1086,15 @@ def fillna( self.categories, is_unique=True, ) - fill_value = column.as_column(fill_value.codes).astype(self.codes.dtype) + fill_value = column.as_column(fill_value.codes).astype( + self.codes.dtype + ) return super().fillna(fill_value, method=method) - def indices_of(self, value: ScalarLike) -> cudf.core.column.NumericalColumn: + def indices_of( + self, value: ScalarLike + ) -> cudf.core.column.NumericalColumn: return self.codes.indices_of(self._encode(value)) @property @@ -1067,14 +1109,18 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self if ( - isinstance(dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype)) + isinstance( + dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) + ) and (dtype.categories is None) and (dtype.ordered is None) ): return self if isinstance(dtype, pd.CategoricalDtype): - dtype = CategoricalDtype(categories=dtype.categories, ordered=dtype.ordered) + dtype = CategoricalDtype( + categories=dtype.categories, ordered=dtype.ordered + ) if not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be 
CategoricalDtype") @@ -1091,14 +1137,26 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column(self, dtype, format: str | None = None) -> StringColumn: - return self._get_decategorized_column().as_string_column(dtype, format=format) + def as_string_column( + self, dtype, format: str | None = None + ) -> StringColumn: + return self._get_decategorized_column().as_string_column( + dtype, format=format + ) - def as_datetime_column(self, dtype, format: str | None = None) -> DatetimeColumn: - return self._get_decategorized_column().as_datetime_column(dtype, format) + def as_datetime_column( + self, dtype, format: str | None = None + ) -> DatetimeColumn: + return self._get_decategorized_column().as_datetime_column( + dtype, format + ) - def as_timedelta_column(self, dtype, format: str | None = None) -> TimeDeltaColumn: - return self._get_decategorized_column().as_timedelta_column(dtype, format) + def as_timedelta_column( + self, dtype, format: str | None = None + ) -> TimeDeltaColumn: + return self._get_decategorized_column().as_timedelta_column( + dtype, format + ) def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): @@ -1112,7 +1170,9 @@ def _get_decategorized_column(self) -> ColumnBase: def copy(self, deep: bool = True) -> Self: result_col = super().copy(deep=deep) if deep: - result_col.categories = libcudf.copying.copy_column(self.dtype._categories) + result_col.categories = libcudf.copying.copy_column( + self.dtype._categories + ) return result_col @cached_property @@ -1142,7 +1202,9 @@ def _concat( # improved as the concatenation API is solidified. # Find the first non-null column: - head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0]) + head = next( + (obj for obj in objs if obj.null_count != len(obj)), objs[0] + ) # Combine and de-dupe the categories cats = column.concat_columns([o.categories for o in objs]).unique() @@ -1152,7 +1214,8 @@ def _concat( newsize = sum(map(len, codes)) if newsize > libcudf.MAX_COLUMN_SIZE: raise MemoryError( - f"Result of concat cannot have " f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"Result of concat cannot have " + f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype, masked=True) @@ -1163,17 +1226,23 @@ def _concat( return column.build_categorical_column( categories=column.as_column(cats), - codes=column.build_column(codes_col.base_data, dtype=codes_col.dtype), + codes=column.build_column( + codes_col.base_data, dtype=codes_col.dtype + ), mask=codes_col.base_mask, size=codes_col.size, offset=codes_col.offset, ) - def _with_type_metadata(self: CategoricalColumn, dtype: Dtype) -> CategoricalColumn: + def _with_type_metadata( + self: CategoricalColumn, dtype: Dtype + ) -> CategoricalColumn: if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=column.build_column(self.codes.base_data, dtype=self.codes.dtype), + codes=column.build_column( + self.codes.base_data, dtype=self.codes.dtype + ), mask=self.codes.base_mask, ordered=dtype.ordered, size=self.codes.size, @@ -1222,7 +1291,9 @@ def set_categories( # return a column full of Nulls. 
out_col = _create_empty_categorical_column( self, - CategoricalDtype(categories=new_categories, ordered=ordered), + CategoricalDtype( + categories=new_categories, ordered=ordered + ), ) elif ( not out_col._categories_equal(new_categories, ordered=True) @@ -1234,7 +1305,9 @@ def set_categories( ) return out_col - def _categories_equal(self, new_categories: ColumnBase, ordered=False) -> bool: + def _categories_equal( + self, new_categories: ColumnBase, ordered=False + ) -> bool: cur_categories = self.categories if len(new_categories) != len(cur_categories): return False @@ -1242,8 +1315,12 @@ def _categories_equal(self, new_categories: ColumnBase, ordered=False) -> bool: return False # if order doesn't matter, sort before the equals call below if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values(ignore_index=True) - new_categories = cudf.Series(new_categories).sort_values(ignore_index=True) + cur_categories = cudf.Series(cur_categories).sort_values( + ignore_index=True + ) + new_categories = cudf.Series(new_categories).sort_values( + ignore_index=True + ) return cur_categories.equals(new_categories) def _set_categories( @@ -1272,12 +1349,18 @@ def _set_categories( new_cats = cudf.Series(new_cats)._column.unique() cur_codes = self.codes - max_cat_size = len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) + max_cat_size = ( + len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) + ) out_code_dtype = min_unsigned_type(max_cat_size) cur_order = column.as_column(range(len(cur_codes))) - old_codes = column.as_column(range(len(cur_cats)), dtype=out_code_dtype) - new_codes = column.as_column(range(len(new_cats)), dtype=out_code_dtype) + old_codes = column.as_column( + range(len(cur_cats)), dtype=out_code_dtype + ) + new_codes = column.as_column( + range(len(new_cats)), dtype=out_code_dtype + ) new_df = cudf.DataFrame._from_data( data={"new_codes": new_codes, "cats": new_cats} @@ -1302,7 +1385,9 @@ def _set_categories( # codes can't have masks, so take mask out before moving in return column.build_categorical_column( categories=new_cats, - codes=column.build_column(new_codes.base_data, dtype=new_codes.dtype), + codes=column.build_column( + new_codes.base_data, dtype=new_codes.dtype + ), mask=new_codes.base_mask, size=new_codes.size, offset=new_codes.offset, @@ -1321,7 +1406,8 @@ def reorder_categories( # current set of categories. if not self._categories_equal(new_categories, ordered=False): raise ValueError( - "items in new_categories are not the same as in " "old categories" + "items in new_categories are not the same as in " + "old categories" ) return self._set_categories(new_categories, ordered=ordered) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6a05293e500..f13d8cf12f7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -204,12 +204,16 @@ def to_pandas( # This default implementation does not handle nulls in any meaningful # way if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series(pd.arrays.ArrowExtensionArray(pa_array), index=index) + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) else: pd_series = pa_array.to_pandas() @@ -306,7 +310,9 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])["None"].chunk(0) + return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ + "None" + ].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -333,7 +339,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: data = pa.table([array], [None]) - if isinstance(array.type, pa.TimestampType) and array.type.tz is not None: + if ( + isinstance(array.type, pa.TimestampType) + and array.type.tz is not None + ): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) @@ -461,7 +470,9 @@ def copy(self, deep: bool = True) -> Self: else self.base_mask.copy(deep=False), size=self.size, offset=self.offset, - children=tuple(col.copy(deep=False) for col in self.base_children), + children=tuple( + col.copy(deep=False) for col in self.base_children + ), ), ) @@ -482,7 +493,9 @@ def view(self, dtype: Dtype) -> ColumnBase: dtype = cudf.dtype(dtype) if dtype.kind in ("o", "u", "s"): - raise TypeError("Bytes viewed as str without metadata is ambiguous") + raise TypeError( + "Bytes viewed as str without metadata is ambiguous" + ) if self.dtype.itemsize == dtype.itemsize: return build_column( @@ -495,7 +508,9 @@ def view(self, dtype: Dtype) -> ColumnBase: else: if self.null_count > 0: - raise ValueError("Can not produce a view of a column with nulls") + raise ValueError( + "Can not produce a view of a column with nulls" + ) if (self.size * self.dtype.itemsize) % dtype.itemsize: raise ValueError( @@ -530,7 +545,9 @@ def element_indexing(self, index: int): return pd.Timedelta(result) return result - def slice(self, start: int, stop: int, stride: Optional[int] = None) -> Self: + def slice( + self, start: int, stop: int, stride: Optional[int] = None + ) -> Self: stride = 1 if stride is None else stride if start < 0: start = start + len(self) @@ -630,7 +647,9 @@ def _scatter_by_column( if is_bool_dtype(key.dtype): # `key` is boolean mask if len(key) != len(self): - raise ValueError("Boolean mask must be of same length as column") + raise ValueError( + "Boolean mask must be of same length as column" + ) if isinstance(value, ColumnBase) and len(self) == len(value): # Both value and key are aligned to self. 
Thus, the values # corresponding to the false values in key should be @@ -655,9 +674,9 @@ def _scatter_by_column( 0 ]._with_type_metadata(self.dtype) else: - return libcudf.copying.scatter([value], key, [self])[0]._with_type_metadata( - self.dtype - ) + return libcudf.copying.scatter([value], key, [self])[ + 0 + ]._with_type_metadata(self.dtype) def _check_scatter_key_length( self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase] @@ -709,7 +728,9 @@ def notnull(self) -> ColumnBase: return result - def indices_of(self, value: ScalarLike | Self) -> cudf.core.column.NumericalColumn: + def indices_of( + self, value: ScalarLike | Self + ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -914,11 +935,15 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls() and libcudf.sort.is_sorted([self], [True], None) + return not self.has_nulls() and libcudf.sort.is_sorted( + [self], [True], None + ) @property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls() and libcudf.sort.is_sorted([self], [False], None) + return not self.has_nulls() and libcudf.sort.is_sorted( + [self], [False], None + ) def sort_values( self: ColumnBase, @@ -933,7 +958,9 @@ def distinct_count(self, dropna: bool = True) -> int: try: return self._distinct_count[dropna] except KeyError: - self._distinct_count[dropna] = cpp_distinct_count(self, ignore_nulls=dropna) + self._distinct_count[dropna] = cpp_distinct_count( + self, ignore_nulls=dropna + ) return self._distinct_count[dropna] def can_cast_safely(self, to_dtype: Dtype) -> bool: @@ -993,8 +1020,12 @@ def as_categorical_column(self, dtype) -> ColumnBase: # Re-label self w.r.t. the provided categories if ( - isinstance(dtype, cudf.CategoricalDtype) and dtype._categories is not None - ) or (isinstance(dtype, pd.CategoricalDtype) and dtype.categories is not None): + isinstance(dtype, cudf.CategoricalDtype) + and dtype._categories is not None + ) or ( + isinstance(dtype, pd.CategoricalDtype) + and dtype.categories is not None + ): labels = self._label_encoding(cats=as_column(dtype.categories)) return build_categorical_column( @@ -1024,7 +1055,9 @@ def as_categorical_column(self, dtype) -> ColumnBase: ordered=ordered, ) - def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError def as_datetime_column( @@ -1032,7 +1065,9 @@ def as_datetime_column( ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError - def as_interval_column(self, dtype: Dtype) -> "cudf.core.column.IntervalColumn": + def as_interval_column( + self, dtype: Dtype + ) -> "cudf.core.column.IntervalColumn": raise NotImplementedError def as_timedelta_column( @@ -1055,12 +1090,16 @@ def apply_boolean_mask(self, mask) -> ColumnBase: if not is_bool_dtype(mask.dtype): raise ValueError("boolean_mask is not boolean type.") - return apply_boolean_mask([self], mask)[0]._with_type_metadata(self.dtype) + return apply_boolean_mask([self], mask)[0]._with_type_metadata( + self.dtype + ) def argsort( self, ascending: bool = True, na_position: str = "last" ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by([self], [ascending], na_position, stable=True) + return libcudf.sort.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError( @@ -1072,7 +1111,8 @@ def __arrow_array__(self, 
type=None): @property def __cuda_array_interface__(self): raise NotImplementedError( - f"dtype {self.dtype} is not yet supported via " "`__cuda_array_interface__`" + f"dtype {self.dtype} is not yet supported via " + "`__cuda_array_interface__`" ) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -1101,7 +1141,9 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata(self.dtype) + return drop_duplicates([self], keep="first")[0]._with_type_metadata( + self.dtype + ) def serialize(self) -> Tuple[dict, list]: # data model: @@ -1138,7 +1180,9 @@ def serialize(self) -> Tuple[dict, list]: header["mask"] = mask_header frames.extend(mask_frames) if self.children: - child_headers, child_frames = zip(*(c.serialize() for c in self.children)) + child_headers, child_frames = zip( + *(c.serialize() for c in self.children) + ) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) header["size"] = self.size @@ -1184,9 +1228,13 @@ def unpack(header, frames) -> Tuple[Any, list]: ) def unary_operator(self, unaryop: str): - raise TypeError(f"Operation {unaryop} not supported for dtype {self.dtype}.") + raise TypeError( + f"Operation {unaryop} not supported for dtype {self.dtype}." + ) - def normalize_binop_value(self, other: ScalarLike) -> Union[ColumnBase, ScalarLike]: + def normalize_binop_value( + self, other: ScalarLike + ) -> Union[ColumnBase, ScalarLike]: raise NotImplementedError def _reduce( @@ -1205,7 +1253,9 @@ def _reduce( The minimum number of entries for the reduction, otherwise the reduction returns NaN. """ - preprocessed = self._process_for_reduction(skipna=skipna, min_count=min_count) + preprocessed = self._process_for_reduction( + skipna=skipna, min_count=min_count + ) if isinstance(preprocessed, ColumnBase): return libcudf.reduce.reduce(op, preprocessed, **kwargs) return preprocessed @@ -1307,7 +1357,9 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - left_gather_map, right_gather_map = cpp_join([self], [cats], how="left") + left_gather_map, right_gather_map = cpp_join( + [self], [cats], how="left" + ) codes = libcudf.copying.gather( [as_column(range(len(cats)), dtype=dtype)], right_gather_map, @@ -1339,7 +1391,9 @@ def column_empty_like( and dtype == column.dtype ): catcolumn = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like(catcolumn.codes, masked=masked, newsize=newsize) + codes = column_empty_like( + catcolumn.codes, masked=masked, newsize=newsize + ) return build_column( data=None, dtype=dtype, @@ -1351,7 +1405,9 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def column_empty_like_same_mask(column: ColumnBase, dtype: Dtype) -> ColumnBase: +def column_empty_like_same_mask( + column: ColumnBase, dtype: Dtype +) -> ColumnBase: """Create a new empty Column with the same length and the same mask. 
Parameters @@ -1381,7 +1437,9 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column(0, length=row_count + 1, dtype=libcudf.types.size_type_dtype), + as_column( + 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype + ), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1400,7 +1458,9 @@ def column_empty( elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) children = ( - as_column(0, length=row_count + 1, dtype=libcudf.types.size_type_dtype), + as_column( + 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype + ), ) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -1410,7 +1470,9 @@ def column_empty( else: mask = None - return build_column(data, dtype, mask=mask, size=row_count, children=children) + return build_column( + data, dtype, mask=mask, size=row_count, children=children + ) def build_column( @@ -1669,7 +1731,8 @@ def as_column( If None (default), treats NaN values in arbitrary as null if there is no mask passed along with it. If True, combines the mask and NaNs to form a new validity mask. If False, leaves NaN values as is. - Only applies when arbitrary is not a cudf object (Index, Series, Column). + Only applies when arbitrary is not a cudf object + (Index, Series, Column). dtype : optional Optionally typecast the constructed Column to the given dtype. @@ -1702,7 +1765,9 @@ def as_column( as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), ) if cudf.get_option("default_integer_bitwidth") and dtype is None: - dtype = cudf.dtype(f'i{cudf.get_option("default_integer_bitwidth")//8}') + dtype = cudf.dtype( + f'i{cudf.get_option("default_integer_bitwidth")//8}' + ) if dtype is not None: return column.astype(dtype) return column @@ -1780,7 +1845,10 @@ def as_column( "yet supported in pyarrow, see: " "https://github.com/apache/arrow/issues/20213" ) - elif pa.types.is_timestamp(arbitrary.type) and arbitrary.type.tz is not None: + elif ( + pa.types.is_timestamp(arbitrary.type) + and arbitrary.type.tz is not None + ): raise NotImplementedError( "cuDF does not yet support timezone-aware datetimes" ) @@ -1807,9 +1875,9 @@ def as_column( # be of `object` dtype. new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) - if cudf.get_option("mode.pandas_compatible") and new_dtype == cudf.dtype( - "O" - ): + if cudf.get_option( + "mode.pandas_compatible" + ) and new_dtype == cudf.dtype("O"): # We internally raise if we do `astype("object")`, hence # need to cast to `str` since this is safe to do so because # it is a null-array. 
@@ -1821,7 +1889,9 @@ def as_column( return col - elif isinstance(arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray)): + elif isinstance( + arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray) + ): if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)): raise NotImplementedError( f"cuDF does not yet support {type(arbitrary.dtype).__name__}" @@ -1840,7 +1910,9 @@ def as_column( ) or ( isinstance(arbitrary.dtype, pd.CategoricalDtype) - and isinstance(arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype) + and isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ) ) ): raise NotImplementedError( @@ -1859,7 +1931,9 @@ def as_column( dtype=dtype, length=length, ) - elif isinstance(arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)): + elif isinstance( + arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype) + ): data = as_column( pa.array(arbitrary, from_pandas=True), nan_as_null=nan_as_null, @@ -1869,7 +1943,9 @@ def as_column( elif isinstance( arbitrary.dtype, pd.api.extensions.ExtensionDtype ) and not isinstance(arbitrary, NumpyExtensionArray): - raise NotImplementedError("Custom pandas ExtensionDtypes are not supported") + raise NotImplementedError( + "Custom pandas ExtensionDtypes are not supported" + ) elif arbitrary.dtype.kind in "fiubmM": # numpy dtype like if isinstance(arbitrary, NumpyExtensionArray): @@ -1899,9 +1975,12 @@ def as_column( "empty", "boolean", ): - raise TypeError(f"Cannot convert a {inferred_dtype} of object type") + raise TypeError( + f"Cannot convert a {inferred_dtype} of object type" + ) elif nan_as_null is False and ( - pd.isna(arbitrary).any() and inferred_dtype not in ("decimal", "empty") + pd.isna(arbitrary).any() + and inferred_dtype not in ("decimal", "empty") ): # Decimal can hold float("nan") # All np.nan is not restricted by type @@ -1964,7 +2043,11 @@ def as_column( arbitrary = np.asarray(arbitrary) # Handle case that `arbitrary` elements are cupy arrays - if shape and shape[0] and hasattr(arbitrary[0], "__cuda_array_interface__"): + if ( + shape + and shape[0] + and hasattr(arbitrary[0], "__cuda_array_interface__") + ): return as_column( cupy.asarray(arbitrary, dtype=arbitrary[0].dtype), nan_as_null=nan_as_null, @@ -2061,7 +2144,9 @@ def as_column( data = data.astype(cudf.dtype(dtype)) elif (view := as_memoryview(arbitrary)) is not None: - return as_column(np.asarray(view), dtype=dtype, nan_as_null=nan_as_null) + return as_column( + np.asarray(view), dtype=dtype, nan_as_null=nan_as_null + ) # Start of arbitrary that's not handed above but dtype provided elif isinstance(dtype, pd.DatetimeTZDtype): raise NotImplementedError( @@ -2121,7 +2206,8 @@ def as_column( and pa_array.type.tz is not None ): raise NotImplementedError( - "cuDF does not yet support timezone-aware " "datetimes" + "cuDF does not yet support timezone-aware " + "datetimes" ) if is_bool_dtype(dtype): # Need this special case handling for bool dtypes, @@ -2132,7 +2218,9 @@ def as_column( if np_dtype.kind in {"m", "M"}: unit = np.datetime_data(np_dtype)[0] if unit not in {"ns", "us", "ms", "s", "D"}: - raise NotImplementedError(f"{dtype=} is not supported.") + raise NotImplementedError( + f"{dtype=} is not supported." + ) pa_type = np_to_pa_dtype(np_dtype) else: # By default cudf constructs a 64-bit column. 
Setting @@ -2142,14 +2230,18 @@ def as_column( cudf.get_option("default_integer_bitwidth") and infer_dtype(arbitrary) == "integer" ): - pa_type = np_to_pa_dtype(_maybe_convert_to_default_type("int")) + pa_type = np_to_pa_dtype( + _maybe_convert_to_default_type("int") + ) if cudf.get_option("default_float_bitwidth") and infer_dtype( arbitrary ) in ( "floating", "mixed-integer-float", ): - pa_type = np_to_pa_dtype(_maybe_convert_to_default_type("float")) + pa_type = np_to_pa_dtype( + _maybe_convert_to_default_type("float") + ) pyarrow_array = pa.array( arbitrary, @@ -2189,7 +2281,9 @@ def as_column( elif ( isinstance(arbitrary, Sequence) and len(arbitrary) > 0 - and any(cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary) + and any( + cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary + ) ): # TODO: I think can be removed; covered by # elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)): @@ -2233,7 +2327,9 @@ def _construct_array( if inferred_dtype == "interval": # Only way to construct an Interval column. return pd.array(arbitrary) - elif inferred_dtype == "string" and getattr(dtype, "kind", None) == "M": + elif ( + inferred_dtype == "string" and getattr(dtype, "kind", None) == "M" + ): # We may have date-like strings with timezones try: with warnings.catch_warnings(): @@ -2252,7 +2348,9 @@ def _construct_array( arbitrary = np.asarray( arbitrary, - dtype=native_dtype if native_dtype is None else np.dtype(native_dtype), + dtype=native_dtype + if native_dtype is None + else np.dtype(native_dtype), ) return arbitrary @@ -2274,7 +2372,9 @@ def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: col = as_column(mask) mask = bools_to_mask(col) else: - raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") + raise NotImplementedError( + f"Cannot infer mask from typestr {typestr}" + ) return mask @@ -2334,7 +2434,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) and np.issubdtype(dtyp, np.datetime64) + _is_non_decimal_numeric_dtype(dtyp) + and np.issubdtype(dtyp, np.datetime64) for dtyp in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) @@ -2362,7 +2463,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: if all(isinstance(o.dtype, CategoricalDtype) for o in objs): return cudf.core.column.categorical.CategoricalColumn._concat( cast( - MutableSequence[cudf.core.column.categorical.CategoricalColumn], + MutableSequence[ + cudf.core.column.categorical.CategoricalColumn + ], objs, ) ) @@ -2370,7 +2473,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: newsize = sum(map(len, objs)) if newsize > libcudf.MAX_COLUMN_SIZE: raise MemoryError( - f"Result of concat cannot have " f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"Result of concat cannot have " + f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: return column_empty(0, head.dtype, masked=True) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index ccf1e248c1b..9a5d9dcd47a 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -175,9 +175,12 @@ def _get_datetime_format(col, dtype, time_unit): sub_second_res_len = 0 has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any() - has_micros = 
time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() + has_micros = ( + time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() + ) has_millis = ( - time_unit in {"ns", "us", "ms"} and col.get_dt_field("millisecond").any() + time_unit in {"ns", "us", "ms"} + and col.get_dt_field("millisecond").any() ) has_seconds = col.get_dt_field("second").any() has_minutes = col.get_dt_field("minute").any() @@ -266,7 +269,9 @@ def __contains__(self, item: ScalarLike) -> bool: # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item_as_dt64.astype("int64") in self.as_numerical_column("int64") + return item_as_dt64.astype("int64") in self.as_numerical_column( + "int64" + ) @property def time_unit(self) -> str: @@ -313,7 +318,9 @@ def values(self): """ Return a CuPy representation of the DateTimeColumn. """ - raise NotImplementedError("DateTime Arrays is not yet implemented in cudf") + raise NotImplementedError( + "DateTime Arrays is not yet implemented in cudf" + ) def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) @@ -354,7 +361,9 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if other_time_unit not in {"s", "ms", "ns", "us"}: other_time_unit = "ns" - return cudf.Scalar(None, dtype=f"datetime64[{other_time_unit}]") + return cudf.Scalar( + None, dtype=f"datetime64[{other_time_unit}]" + ) other = other.astype(self.dtype) return cudf.Scalar(other) @@ -415,9 +424,13 @@ def as_datetime_column( def as_timedelta_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": - raise TypeError(f"cannot astype a datetimelike from {self.dtype} to {dtype}") + raise TypeError( + f"cannot astype a datetimelike from {self.dtype} to {dtype}" + ) - def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, dtype=np.int64, @@ -441,18 +454,22 @@ def as_string_column( if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: - names = cudf.core.column.column_empty(0, dtype="object", masked=False) - if len(self) > 0: - return string._datetime_to_str_typecast_functions[cudf.dtype(self.dtype)]( - self, format, names + names = cudf.core.column.column_empty( + 0, dtype="object", masked=False ) + if len(self) > 0: + return string._datetime_to_str_typecast_functions[ + cudf.dtype(self.dtype) + ](self, format, names) else: return cast( "cudf.core.column.StringColumn", column.column_empty(0, dtype="object", masked=False), ) - def mean(self, skipna=None, min_count: int = 0, dtype=np.float64) -> ScalarLike: + def mean( + self, skipna=None, min_count: int = 0, dtype=np.float64 + ) -> ScalarLike: return pd.Timestamp( self.as_numerical_column("int64").mean( skipna=skipna, min_count=min_count, dtype=dtype @@ -485,7 +502,9 @@ def cov(self, other: DatetimeColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov(other.as_numerical_column("int64")) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) def corr(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): @@ -510,7 +529,9 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timestamp(result, unit=self.time_unit).as_unit(self.time_unit) + 
return pd.Timestamp(result, unit=self.time_unit).as_unit( + self.time_unit + ) return result.astype(self.dtype) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: @@ -562,7 +583,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: "__ne__", }: out_dtype = cudf.dtype(np.bool_) - if isinstance(other, ColumnBase) and not isinstance(other, DatetimeColumn): + if isinstance(other, ColumnBase) and not isinstance( + other, DatetimeColumn + ): result = _all_bools_with_nulls( self, other, bool_fill_value=op == "__ne__" ) @@ -576,9 +599,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) if out_dtype != cudf.dtype(np.bool_) and op == "__add__": return result_col - elif cudf.get_option("mode.pandas_compatible") and out_dtype == cudf.dtype( - np.bool_ - ): + elif cudf.get_option( + "mode.pandas_compatible" + ) and out_dtype == cudf.dtype(np.bool_): return result_col.fillna(op == "__ne__") else: return result_col @@ -599,7 +622,9 @@ def fillna( return super().fillna(fill_value, method) - def indices_of(self, value: ScalarLike) -> cudf.core.column.NumericalColumn: + def indices_of( + self, value: ScalarLike + ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype ).as_numerical_column("int64") @@ -630,7 +655,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if max_dist <= np.timedelta64(max_int, to_res).astype( self_delta_dtype - ) and min_dist <= np.timedelta64(max_int, to_res).astype(self_delta_dtype): + ) and min_dist <= np.timedelta64(max_int, to_res).astype( + self_delta_dtype + ): return True else: return False @@ -681,7 +708,9 @@ def to_pandas( arrow_type: bool = False, ) -> pd.Series: if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") elif arrow_type: @@ -726,7 +755,9 @@ def as_string_column( return self._local_time.as_string_column(dtype, format) def get_dt_field(self, field: str) -> ColumnBase: - return libcudf.datetime.extract_datetime_component(self._local_time, field) + return libcudf.datetime.extract_datetime_component( + self._local_time, field + ) def __repr__(self): # Arrow prints the UTC timestamps, but we want to print the @@ -735,5 +766,7 @@ def __repr__(self): pa.timestamp(self.dtype.unit, str(self.dtype.tz)) ) return ( - f"{object.__repr__(self)}\n" f"{arr.to_string()}\n" f"dtype: {self.dtype}" + f"{object.__repr__(self)}\n" + f"{arr.to_string()}\n" + f"dtype: {self.dtype}" ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 7c6ffca866c..b83a6ded416 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -69,7 +69,9 @@ def as_string_column( def __pow__(self, other): if isinstance(other, int): if other == 0: - res = cudf.core.column.as_column(1, dtype=self.dtype, length=len(self)) + res = cudf.core.column.as_column( + 1, dtype=self.dtype, length=len(self) + ) if self.nullable: res = res.set_mask(self.mask) return res @@ -183,7 +185,9 @@ def normalize_binop_value(self, other): metadata = other.as_tuple() precision = max(len(metadata.digits), metadata.exponent) scale = -metadata.exponent - return cudf.Scalar(other, dtype=self.dtype.__class__(precision, scale)) + return cudf.Scalar( + other, dtype=self.dtype.__class__(precision, scale) + ) return NotImplemented def _decimal_quantile( @@ -191,13 +195,17 @@ def _decimal_quantile( ) -> ColumnBase: quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls - indices = libcudf.sort.order_by([self], [True], "first", stable=True).slice( - self.null_count, len(self) + indices = libcudf.sort.order_by( + [self], [True], "first", stable=True + ).slice(self.null_count, len(self)) + result = libcudf.quantiles.quantile( + self, quant, interpolation, indices, exact ) - result = libcudf.quantiles.quantile(self, quant, interpolation, indices, exact) return result._with_type_metadata(self.dtype) - def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": return libcudf.unary.cast(self, dtype) @@ -231,9 +239,15 @@ def to_arrow(self): data_buf_128[::4] = data_buf_32 # use striding again to set the remaining bits of each 128-bit chunk: # 0 for non-negative values, -1 for negative values: - data_buf_128[1::4] = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0]) - data_buf_128[2::4] = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0]) - data_buf_128[3::4] = np.piecewise(data_buf_32, [data_buf_32 < 0], [-1, 0]) + data_buf_128[1::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf_128[2::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) + data_buf_128[3::4] = np.piecewise( + data_buf_32, [data_buf_32 < 0], [-1, 0] + ) data_buf = pa.py_buffer(data_buf_128) mask_buf = ( self.base_mask @@ -312,7 +326,9 @@ def to_arrow(self): data_buf_128[::2] = data_buf_64 # use striding again to set the remaining bits of each 128-bit chunk: # 0 for non-negative values, -1 for negative values: - data_buf_128[1::2] = np.piecewise(data_buf_64, [data_buf_64 < 0], [-1, 0]) + data_buf_128[1::2] = np.piecewise( + data_buf_64, 
[data_buf_64 < 0], [-1, 0] + ) data_buf = pa.py_buffer(data_buf_128) mask_buf = ( self.base_mask diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 3786b8cf8fa..7bd693966dc 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -65,7 +65,9 @@ def from_struct_column(cls, struct_column: StructColumn, closed="right"): first_field_name = next(iter(struct_column.dtype.fields.keys())) return IntervalColumn( size=struct_column.size, - dtype=IntervalDtype(struct_column.dtype.fields[first_field_name], closed), + dtype=IntervalDtype( + struct_column.dtype.fields[first_field_name], closed + ), mask=struct_column.base_mask, offset=struct_column.offset, null_count=struct_column.null_count, @@ -76,7 +78,9 @@ def copy(self, deep=True): struct_copy = super().copy(deep=deep) return IntervalColumn( size=struct_copy.size, - dtype=IntervalDtype(struct_copy.dtype.fields["left"], self.dtype.closed), + dtype=IntervalDtype( + struct_copy.dtype.fields["left"], self.dtype.closed + ), mask=struct_copy.base_mask, offset=struct_copy.offset, null_count=struct_copy.null_count, @@ -115,7 +119,9 @@ def to_pandas( # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") elif arrow_type: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 64a8f8b617c..1c2bcbef2ec 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -71,9 +71,9 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[0].element_indexing( - current_offset - ) + current_offset = current_base_child.base_children[ + 0 + ].element_indexing(current_offset) n += child0_size current_base_child = current_base_child.base_children[1] @@ -117,7 +117,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return concatenate_rows([self, other]) else: raise NotImplementedError( - "Lists concatenation for this operation is not yet" "supported" + "Lists concatenation for this operation is not yet" + "supported" ) else: raise TypeError("can only concatenate list to list") @@ -184,7 +185,9 @@ def _with_type_metadata( self: "cudf.core.column.ListColumn", dtype: Dtype ) -> "cudf.core.column.ListColumn": if isinstance(dtype, ListDtype): - elements = self.base_children[1]._with_type_metadata(dtype.element_type) + elements = self.base_children[1]._with_type_metadata( + dtype.element_type + ) return ListColumn( dtype=dtype, mask=self.base_mask, @@ -296,12 +299,16 @@ def to_pandas( # Can't rely on Column.to_pandas implementation for lists. # Need to perform `to_pylist` to preserve list types. if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series(pd.arrays.ArrowExtensionArray(pa_array), index=index) + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) else: return pd.Series(pa_array.tolist(), dtype="object", index=index) @@ -315,7 +322,9 @@ class ListMethods(ColumnMethods): def __init__(self, parent: ParentType): if not isinstance(parent.dtype, ListDtype): - raise AttributeError("Can only use .list accessor with a 'list' dtype") + raise AttributeError( + "Can only use .list accessor with a 'list' dtype" + ) super().__init__(parent=parent) def get( @@ -383,11 +392,15 @@ def get( if not (default is None or default is NA): # determine rows for which `index` is out-of-bounds lengths = count_elements(self._column) - out_of_bounds_mask = (np.negative(index) > lengths) | (index >= lengths) + out_of_bounds_mask = (np.negative(index) > lengths) | ( + index >= lengths + ) # replace the value in those rows (should be NA) with `default` if out_of_bounds_mask.any(): - out = out._scatter_by_column(out_of_bounds_mask, cudf.Scalar(default)) + out = out._scatter_by_column( + out_of_bounds_mask, cudf.Scalar(default) + ) if out.dtype != self._column.dtype.element_type: # libcudf doesn't maintain struct labels so we must transfer over # manually from the input column if we lost some information @@ -497,7 +510,9 @@ def leaves(self) -> ParentType: 5 6 dtype: int64 """ - return self._return_or_inplace(self._column.leaves(), retain_index=False) + return self._return_or_inplace( + self._column.leaves(), retain_index=False + ) def len(self) -> ParentType: """ @@ -555,11 +570,17 @@ def take(self, lists_indices: ColumnLike) -> ParentType: if not isinstance(lists_indices_col, ListColumn): raise ValueError("lists_indices should be list type array.") if not lists_indices_col.size == self._column.size: - raise ValueError("lists_indices and list column is of different " "size.") + raise ValueError( + "lists_indices and list column is of different " "size." + ) if not _is_non_decimal_numeric_dtype( lists_indices_col.children[1].dtype - ) or not np.issubdtype(lists_indices_col.children[1].dtype, np.integer): - raise TypeError("lists_indices should be column of values of index types.") + ) or not np.issubdtype( + lists_indices_col.children[1].dtype, np.integer + ): + raise TypeError( + "lists_indices should be column of values of index types." 
+ ) return self._return_or_inplace( segmented_gather(self._column, lists_indices_col) @@ -724,5 +745,7 @@ def astype(self, dtype): ListDtype(float64) """ return self._return_or_inplace( - self._column._transform_leaves(lambda col, dtype: col.astype(dtype), dtype) + self._column._transform_leaves( + lambda col, dtype: col.astype(dtype), dtype + ) ) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 422d03677e9..0f5a0eb086b 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -67,7 +67,9 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_data({self._parent.name: new_col}), + self._parent.__class__._from_data( + {self._parent.name: new_col} + ), inplace=True, ) return None @@ -95,6 +97,8 @@ def _return_or_inplace( else: return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.core.index.as_index(new_col, name=self._parent.name) + return cudf.core.index.as_index( + new_col, name=self._parent.name + ) else: return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8e2e4cb3d8b..b2bd73c9856 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -131,16 +131,24 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: f"Cannot use a {type(value).__name__} to find an index of " f"a {self.dtype} Index." ) - if value is not None and self.dtype.kind in {"c", "f"} and np.isnan(value): + if ( + value is not None + and self.dtype.kind in {"c", "f"} + and np.isnan(value) + ): return column.as_column( - cp.argwhere(cp.isnan(self.data_array_view(mode="read"))).flatten(), + cp.argwhere( + cp.isnan(self.data_array_view(mode="read")) + ).flatten(), dtype=size_type_dtype, ) else: return super().indices_of(value) def has_nulls(self, include_nan: bool = False) -> bool: - return bool(self.null_count != 0) or (include_nan and bool(self.nan_count != 0)) + return bool(self.null_count != 0) or ( + include_nan and bool(self.nan_count != 0) + ) def __setitem__(self, key: Any, value: Any): """ @@ -173,7 +181,9 @@ def __setitem__(self, key: Any, value: Any): else: key = as_column( key, - dtype="float64" if isinstance(key, list) and len(key) == 0 else None, + dtype="float64" + if isinstance(key, list) and len(key) == 0 + else None, ) if not isinstance(key, cudf.core.column.NumericalColumn): raise ValueError(f"Invalid scatter map type {key.dtype}.") @@ -345,9 +355,9 @@ def as_string_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if len(self) > 0: - return string._numeric_to_str_typecast_functions[cudf.dtype(self.dtype)]( - self - ) + return string._numeric_to_str_typecast_functions[ + cudf.dtype(self.dtype) + ](self) else: return cast( cudf.core.column.StringColumn, @@ -382,7 +392,9 @@ def as_timedelta_column( ), ) - def as_decimal_column(self, dtype: Dtype) -> "cudf.core.column.DecimalBaseColumn": + def as_decimal_column( + self, dtype: Dtype + ) -> "cudf.core.column.DecimalBaseColumn": return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: @@ -488,11 +500,15 @@ def find_and_replace( ): return self.copy() - to_replace_col = _normalize_find_and_replace_input(self.dtype, to_replace) + to_replace_col = _normalize_find_and_replace_input( + self.dtype, to_replace + ) if 
all_nan: replacement_col = column.as_column(replacement, dtype=self.dtype) else: - replacement_col = _normalize_find_and_replace_input(self.dtype, replacement) + replacement_col = _normalize_find_and_replace_input( + self.dtype, replacement + ) if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( replacement[0], length=len(to_replace_col), dtype=self.dtype @@ -502,7 +518,9 @@ def find_and_replace( to_replace_col, replacement_col, replaced = numeric_normalize_types( to_replace_col, replacement_col, self ) - df = cudf.DataFrame._from_data({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: replaced = replaced.fillna( @@ -512,7 +530,9 @@ def find_and_replace( ) df = df.dropna(subset=["old"]) - return libcudf.replace.replace(replaced, df._data["old"], df._data["new"]) + return libcudf.replace.replace( + replaced, df._data["old"], df._data["new"] + ) def fillna( self, @@ -533,7 +553,10 @@ def fillna( if fill_value is None: raise ValueError("Must specify either 'fill_value' or 'method'") - if isinstance(fill_value, cudf.Scalar) and fill_value.dtype == col.dtype: + if ( + isinstance(fill_value, cudf.Scalar) + and fill_value.dtype == col.dtype + ): return super(NumericalColumn, col).fillna(fill_value, method) if np.isscalar(fill_value): @@ -604,7 +627,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: i_max_ = np.iinfo(self.dtype).max u_max_ = np.iinfo(to_dtype).max - return (self.min() >= 0) and ((i_max_ <= u_max_) or (self.max() < u_max_)) + return (self.min() >= 0) and ( + (i_max_ <= u_max_) or (self.max() < u_max_) + ) # want to cast uint to int elif self.dtype.kind == "u" and to_dtype.kind == "i": @@ -617,7 +642,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: elif self.dtype.kind in {"i", "u"} and to_dtype.kind == "f": info = np.finfo(to_dtype) biggest_exact_int = 2 ** (info.nmant + 1) - if (self.min() >= -biggest_exact_int) and (self.max() <= biggest_exact_int): + if (self.min() >= -biggest_exact_int) and ( + self.max() <= biggest_exact_int + ): return True else: filled = self.fillna(0) @@ -664,14 +691,20 @@ def to_pandas( arrow_type: bool = False, ) -> pd.Series: if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) elif arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index ) elif ( nullable - and (pandas_nullable_dtype := np_dtypes_to_pandas_dtypes.get(self.dtype)) + and ( + pandas_nullable_dtype := np_dtypes_to_pandas_dtypes.get( + self.dtype + ) + ) is not None ): arrow_array = self.to_arrow() @@ -685,7 +718,9 @@ def to_pandas( def _reduction_result_dtype(self, reduction_op: str) -> Dtype: col_dtype = self.dtype if reduction_op in {"sum", "product"}: - col_dtype = col_dtype if col_dtype.kind == "f" else np.dtype("int64") + col_dtype = ( + col_dtype if col_dtype.kind == "f" else np.dtype("int64") + ) elif reduction_op == "sum_of_squares": col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) @@ -703,7 +738,9 @@ def _normalize_find_and_replace_input( if isinstance(col_to_normalize, list): if normalized_column.null_count == len(normalized_column): normalized_column = normalized_column.astype(input_column_dtype) - col_to_normalize_dtype = min_column_type(normalized_column, input_column_dtype) + col_to_normalize_dtype = min_column_type( + normalized_column, input_column_dtype + ) # Scalar case if len(col_to_normalize) == 1: if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): @@ -730,7 +767,8 @@ def _normalize_find_and_replace_input( raise TypeError(f"Type {type(col_to_normalize)} not supported") if ( - col_to_normalize_dtype.kind == "f" and input_column_dtype.kind in {"i", "u"} + col_to_normalize_dtype.kind == "f" + and input_column_dtype.kind in {"i", "u"} ) or (col_to_normalize_dtype.num > input_column_dtype.num): raise TypeError( f"Potentially unsafe cast for non-equivalent " @@ -740,7 +778,9 @@ def _normalize_find_and_replace_input( return normalized_column.astype(input_column_dtype) -def digitize(column: ColumnBase, bins: np.ndarray, right: bool = False) -> ColumnBase: +def digitize( + column: ColumnBase, bins: np.ndarray, right: bool = False +) -> ColumnBase: """Return the indices of the bins to which each value in column belongs. Parameters diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index ecb3a14e18b..c45a9c7fd5d 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
"""Define an interface for columns that can perform numerical operations.""" from __future__ import annotations @@ -99,7 +99,9 @@ def quantile( return_scalar: bool, ) -> NumericalBaseColumn: if np.logical_or(q < 0, q > 1).any(): - raise ValueError("percentiles should all be in the interval [0, 1]") + raise ValueError( + "percentiles should all be in the interval [0, 1]" + ) # Beyond this point, q either being scalar or list-like # will only have values in range [0, 1] if len(self) == 0: @@ -117,7 +119,9 @@ def quantile( try: new_scalar = self.dtype.type(scalar_result) scalar_result = ( - new_scalar if new_scalar == scalar_result else scalar_result + new_scalar + if new_scalar == scalar_result + else scalar_result ) except (TypeError, ValueError): pass @@ -134,7 +138,9 @@ def mean( min_count: int = 0, dtype=np.float64, ): - return self._reduce("mean", skipna=skipna, min_count=min_count, dtype=dtype) + return self._reduce( + "mean", skipna=skipna, min_count=min_count, dtype=dtype + ) def var( self, @@ -176,14 +182,20 @@ def _numeric_quantile( self, q: np.ndarray, interpolation: str, exact: bool ) -> NumericalBaseColumn: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by([self], [True], "first", stable=True).slice( - self.null_count, len(self) - ) + indices = libcudf.sort.order_by( + [self], [True], "first", stable=True + ).slice(self.null_count, len(self)) - return libcudf.quantiles.quantile(self, q, interpolation, indices, exact) + return libcudf.quantiles.quantile( + self, q, interpolation, indices, exact + ) def cov(self, other: NumericalBaseColumn) -> float: - if len(self) == 0 or len(other) == 0 or (len(self) == 1 and len(other) == 1): + if ( + len(self) == 0 + or len(other) == 0 + or (len(self) == 1 and len(other) == 1) + ): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) result = (self - self.mean()) * (other - other.mean()) @@ -201,7 +213,9 @@ def corr(self, other: NumericalBaseColumn) -> float: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) return cov / lhs_std / rhs_std - def round(self, decimals: int = 0, how: str = "half_even") -> NumericalBaseColumn: + def round( + self, decimals: int = 0, how: str = "half_even" + ) -> NumericalBaseColumn: if not cudf.api.types.is_integer(decimals): raise TypeError("Values in decimals must be integers") """Round the values in the Column to the given number of decimals.""" diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 49f69f63592..fb76fcdaf39 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -40,9 +40,9 @@ def str_to_boolean(column: StringColumn): """Takes in string column and returns boolean column""" - return (libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8")).fillna( - False - ) + return ( + libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") + ).fillna(False) if TYPE_CHECKING: @@ -126,7 +126,9 @@ def __init__(self, parent): else parent.dtype ) if not is_string_dtype(value_type): - raise AttributeError("Can only use .str accessor with string values") + raise AttributeError( + "Can only use .str accessor with string values" + ) super().__init__(parent=parent) def htoi(self) -> SeriesOrIndex: @@ -217,7 +219,9 @@ def len(self) -> SeriesOrIndex: dtype: int32 """ - return self._return_or_inplace(libstrings.count_characters(self._column)) + return self._return_or_inplace( + libstrings.count_characters(self._column) + ) def byte_count(self) -> SeriesOrIndex: """ @@ -251,7 +255,9 
@@ def byte_count(self) -> SeriesOrIndex: ) @overload - def cat(self, sep: Optional[str] = None, na_rep: Optional[str] = None) -> str: + def cat( + self, sep: Optional[str] = None, na_rep: Optional[str] = None + ) -> str: ... @overload @@ -374,7 +380,9 @@ def cat(self, others=None, sep=None, na_rep=None): out = out[0] return out - def join(self, sep=None, string_na_rep=None, sep_na_rep=None) -> SeriesOrIndex: + def join( + self, sep=None, string_na_rep=None, sep_na_rep=None + ) -> SeriesOrIndex: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -494,7 +502,9 @@ def join(self, sep=None, string_na_rep=None, sep_na_rep=None) -> SeriesOrIndex: string_na_rep = "" if is_scalar(sep) and sep_na_rep: - raise ValueError("sep_na_rep cannot be defined when `sep` is scalar.") + raise ValueError( + "sep_na_rep cannot be defined when `sep` is scalar." + ) if sep_na_rep is None: sep_na_rep = "" @@ -557,7 +567,9 @@ def _split_by_character(self): children=(offset_col, result_col), ) - def extract(self, pat: str, flags: int = 0, expand: bool = True) -> SeriesOrIndex: + def extract( + self, pat: str, flags: int = 0, expand: bool = True + ) -> SeriesOrIndex: r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -616,7 +628,9 @@ def extract(self, pat: str, flags: int = 0, expand: bool = True) -> SeriesOrInde re.MULTILINE. """ # noqa W605 if not _is_supported_regex_flags(flags): - raise NotImplementedError("unsupported value for `flags` parameter") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) data, _ = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: @@ -748,9 +762,13 @@ def contains( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError("unsupported value for `flags` parameter") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) if regex and not case: - raise NotImplementedError("`case=False` only supported when `regex=False`") + raise NotImplementedError( + "`case=False` only supported when `regex=False`" + ) if is_scalar(pat): if regex: @@ -820,13 +838,17 @@ def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: dtype: boolean """ if not isinstance(pat, str): - raise TypeError(f"expected a string object, not {type(pat).__name__}") + raise TypeError( + f"expected a string object, not {type(pat).__name__}" + ) if esc is None: esc = "" if not isinstance(esc, str): - raise TypeError(f"expected a string object, not {type(esc).__name__}") + raise TypeError( + f"expected a string object, not {type(esc).__name__}" + ) if len(esc) > 1: raise ValueError( @@ -892,7 +914,9 @@ def repeat( ), ) - return self._return_or_inplace(libstrings.repeat_scalar(self._column, repeats)) + return self._return_or_inplace( + libstrings.repeat_scalar(self._column, repeats) + ) def replace( self, @@ -1001,7 +1025,9 @@ def replace( # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( - libstrings.replace_re(self._column, pat, cudf.Scalar(repl, "str"), n) + libstrings.replace_re( + self._column, pat, cudf.Scalar(repl, "str"), n + ) if regex is True and len(pat) > 1 else libstrings.replace( self._column, @@ -1245,7 +1271,9 @@ def istimestamp(self, format: str) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(str_cast.istimestamp(self._column, format)) + return self._return_or_inplace( + str_cast.istimestamp(self._column, format) + ) def isfloat(self) -> 
SeriesOrIndex: r""" @@ -2056,10 +2084,14 @@ def filter_alphanum( repl = "" return self._return_or_inplace( - libstrings.filter_alphanum(self._column, cudf.Scalar(repl, "str"), keep), + libstrings.filter_alphanum( + self._column, cudf.Scalar(repl, "str"), keep + ), ) - def slice_from(self, starts: "cudf.Series", stops: "cudf.Series") -> SeriesOrIndex: + def slice_from( + self, starts: "cudf.Series", stops: "cudf.Series" + ) -> SeriesOrIndex: """ Return substring of each string using positions for each string. @@ -2196,7 +2228,9 @@ def slice_replace( ), ) - def insert(self, start: int = 0, repl: Optional[str] = None) -> SeriesOrIndex: + def insert( + self, start: int = 0, repl: Optional[str] = None + ) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2363,7 +2397,9 @@ def get_json_object( options = libstrings.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, - strip_quotes_from_single_strings=(strip_quotes_from_single_strings), + strip_quotes_from_single_strings=( + strip_quotes_from_single_strings + ), missing_fields_as_nulls=missing_fields_as_nulls, ) return self._return_or_inplace( @@ -2499,7 +2535,8 @@ def split( if expand not in (True, False): raise ValueError( - f"expand parameter accepts only : [True, False], " f"got {expand}" + f"expand parameter accepts only : [True, False], " + f"got {expand}" ) # Pandas treats 0 as all @@ -2522,7 +2559,9 @@ def split( if regex is True: data, _ = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split(self._column, cudf.Scalar(pat, "str"), n) + data, _ = libstrings.split( + self._column, cudf.Scalar(pat, "str"), n + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2672,7 +2711,8 @@ def rsplit( if expand not in (True, False): raise ValueError( - f"expand parameter accepts only : [True, False], " f"got {expand}" + f"expand parameter accepts only : [True, False], " + f"got {expand}" ) # Pandas treats 0 as all @@ -2701,7 +2741,9 @@ def rsplit( result_table = data else: if regex is True: - result_table = libstrings.rsplit_record_re(self._column, pat, n) + result_table = libstrings.rsplit_record_re( + self._column, pat, n + ) else: result_table = libstrings.rsplit_record( self._column, cudf.Scalar(pat, "str"), n @@ -2781,7 +2823,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: """ if expand is not True: - raise NotImplementedError("`expand=False` is currently not supported") + raise NotImplementedError( + "`expand=False` is currently not supported" + ) if sep is None: sep = " " @@ -2844,7 +2888,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) """ if expand is not True: - raise NotImplementedError("`expand=False` is currently not supported") + raise NotImplementedError( + "`expand=False` is currently not supported" + ) if sep is None: sep = " " @@ -2854,7 +2900,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: expand=expand, ) - def pad(self, width: int, side: str = "left", fillchar: str = " ") -> SeriesOrIndex: + def pad( + self, width: int, side: str = "left", fillchar: str = " " + ) -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. 
@@ -2916,7 +2964,9 @@ def pad(self, width: int, side: str = "left", fillchar: str = " ") -> SeriesOrIn dtype: object """ if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" + msg = ( + f"fillchar must be a character, not {type(fillchar).__name__}" + ) raise TypeError(msg) if len(fillchar) != 1: @@ -2929,7 +2979,9 @@ def pad(self, width: int, side: str = "left", fillchar: str = " ") -> SeriesOrIn try: side = libstrings.SideType[side.upper()] except KeyError: - raise ValueError("side has to be either one of {'left', 'right', 'both'}") + raise ValueError( + "side has to be either one of {'left', 'right', 'both'}" + ) return self._return_or_inplace( libstrings.pad(self._column, width, fillchar, side) @@ -3057,7 +3109,9 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: dtype: object """ if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" + msg = ( + f"fillchar must be a character, not {type(fillchar).__name__}" + ) raise TypeError(msg) if len(fillchar) != 1: @@ -3067,7 +3121,9 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(libstrings.center(self._column, width, fillchar)) + return self._return_or_inplace( + libstrings.center(self._column, width, fillchar) + ) def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3107,7 +3163,9 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: dtype: object """ if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" + msg = ( + f"fillchar must be a character, not {type(fillchar).__name__}" + ) raise TypeError(msg) if len(fillchar) != 1: @@ -3117,7 +3175,9 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(libstrings.ljust(self._column, width, fillchar)) + return self._return_or_inplace( + libstrings.ljust(self._column, width, fillchar) + ) def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3157,7 +3217,9 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: dtype: object """ if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" + msg = ( + f"fillchar must be a character, not {type(fillchar).__name__}" + ) raise TypeError(msg) if len(fillchar) != 1: @@ -3167,7 +3229,9 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(libstrings.rjust(self._column, width, fillchar)) + return self._return_or_inplace( + libstrings.rjust(self._column, width, fillchar) + ) def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: r""" @@ -3384,30 +3448,42 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: if expand_tabs is True: raise NotImplementedError("`expand_tabs=True` is not supported") elif expand_tabs is None: - warnings.warn("wrap current implementation defaults to `expand_tabs`=False") + warnings.warn( + "wrap current implementation defaults to `expand_tabs`=False" + ) replace_whitespace = kwargs.get("replace_whitespace", True) if not replace_whitespace: - raise NotImplementedError("`replace_whitespace=False` is not supported") + raise 
NotImplementedError( + "`replace_whitespace=False` is not supported" + ) drop_whitespace = kwargs.get("drop_whitespace", True) if not drop_whitespace: - raise NotImplementedError("`drop_whitespace=False` is not supported") + raise NotImplementedError( + "`drop_whitespace=False` is not supported" + ) break_long_words = kwargs.get("break_long_words", None) if break_long_words is True: - raise NotImplementedError("`break_long_words=True` is not supported") + raise NotImplementedError( + "`break_long_words=True` is not supported" + ) elif break_long_words is None: warnings.warn( - "wrap current implementation defaults to " "`break_long_words`=False" + "wrap current implementation defaults to " + "`break_long_words`=False" ) break_on_hyphens = kwargs.get("break_on_hyphens", None) if break_long_words is True: - raise NotImplementedError("`break_on_hyphens=True` is not supported") + raise NotImplementedError( + "`break_on_hyphens=True` is not supported" + ) elif break_on_hyphens is None: warnings.warn( - "wrap current implementation defaults to " "`break_on_hyphens`=False" + "wrap current implementation defaults to " + "`break_on_hyphens`=False" ) return self._return_or_inplace(libstrings.wrap(self._column, width)) @@ -3475,9 +3551,13 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError("unsupported value for `flags` parameter") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) - return self._return_or_inplace(libstrings.count_re(self._column, pat, flags)) + return self._return_or_inplace( + libstrings.count_re(self._column, pat, flags) + ) def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3548,7 +3628,9 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError("unsupported value for `flags` parameter") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) @@ -3752,7 +3834,9 @@ def endswith(self, pat: str) -> SeriesOrIndex: f"{type(pat).__name__}" ) elif is_scalar(pat): - result_col = libstrings.endswith(self._column, cudf.Scalar(pat, "str")) + result_col = libstrings.endswith( + self._column, cudf.Scalar(pat, "str") + ) else: result_col = libstrings.endswith_multiple( self._column, column.as_column(pat, dtype="str") @@ -3813,7 +3897,9 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: f"{type(pat).__name__}" ) elif is_scalar(pat): - result_col = libstrings.startswith(self._column, cudf.Scalar(pat, "str")) + result_col = libstrings.startswith( + self._column, cudf.Scalar(pat, "str") + ) else: result_col = libstrings.startswith_multiple( self._column, column.as_column(pat, dtype="str") @@ -3854,8 +3940,12 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: """ if suffix is None or len(suffix) == 0: return self._return_or_inplace(self._column) - ends_column = libstrings.endswith(self._column, cudf.Scalar(suffix, "str")) - removed_column = libstrings.slice_strings(self._column, 0, -len(suffix), None) + ends_column = libstrings.endswith( + self._column, cudf.Scalar(suffix, "str") + ) + removed_column = libstrings.slice_strings( + self._column, 0, -len(suffix), None + ) result = cudf._lib.copying.copy_if_else( removed_column, self._column, ends_column ) @@ -3894,8 +3984,12 @@ def 
removeprefix(self, prefix: str) -> SeriesOrIndex: """ if prefix is None or len(prefix) == 0: return self._return_or_inplace(self._column) - starts_column = libstrings.startswith(self._column, cudf.Scalar(prefix, "str")) - removed_column = libstrings.slice_strings(self._column, len(prefix), None, None) + starts_column = libstrings.startswith( + self._column, cudf.Scalar(prefix, "str") + ) + removed_column = libstrings.slice_strings( + self._column, len(prefix), None, None + ) result = cudf._lib.copying.copy_if_else( removed_column, self._column, starts_column ) @@ -3945,12 +4039,16 @@ def find( dtype: int32 """ if not isinstance(sub, str): - raise TypeError(f"expected a string object, not {type(sub).__name__}") + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) if end is None: end = -1 - result_col = libstrings.find(self._column, cudf.Scalar(sub, "str"), start, end) + result_col = libstrings.find( + self._column, cudf.Scalar(sub, "str"), start, end + ) return self._return_or_inplace(result_col) @@ -4002,12 +4100,16 @@ def rfind( dtype: int32 """ if not isinstance(sub, str): - raise TypeError(f"expected a string object, not {type(sub).__name__}") + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) if end is None: end = -1 - result_col = libstrings.rfind(self._column, cudf.Scalar(sub, "str"), start, end) + result_col = libstrings.rfind( + self._column, cudf.Scalar(sub, "str"), start, end + ) return self._return_or_inplace(result_col) @@ -4055,12 +4157,16 @@ def index( dtype: int32 """ if not isinstance(sub, str): - raise TypeError(f"expected a string object, not {type(sub).__name__}") + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) if end is None: end = -1 - result_col = libstrings.find(self._column, cudf.Scalar(sub, "str"), start, end) + result_col = libstrings.find( + self._column, cudf.Scalar(sub, "str"), start, end + ) result = self._return_or_inplace(result_col) @@ -4113,12 +4219,16 @@ def rindex( dtype: int32 """ if not isinstance(sub, str): - raise TypeError(f"expected a string object, not {type(sub).__name__}") + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) if end is None: end = -1 - result_col = libstrings.rfind(self._column, cudf.Scalar(sub, "str"), start, end) + result_col = libstrings.rfind( + self._column, cudf.Scalar(sub, "str"), start, end + ) result = self._return_or_inplace(result_col) @@ -4127,7 +4237,9 @@ def rindex( else: return result - def match(self, pat: str, case: bool = True, flags: int = 0) -> SeriesOrIndex: + def match( + self, pat: str, case: bool = True, flags: int = 0 + ) -> SeriesOrIndex: """ Determine if each string matches a regular expression. 
@@ -4176,9 +4288,13 @@ def match(self, pat: str, case: bool = True, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise NotImplementedError("unsupported value for `flags` parameter") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) - return self._return_or_inplace(libstrings.match_re(self._column, pat, flags)) + return self._return_or_inplace( + libstrings.match_re(self._column, pat, flags) + ) def url_decode(self) -> SeriesOrIndex: """ @@ -4322,7 +4438,9 @@ def translate(self, table: dict) -> SeriesOrIndex: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace(libstrings.translate(self._column, table)) + return self._return_or_inplace( + libstrings.translate(self._column, table) + ) def filter_characters( self, table: dict, keep: bool = True, repl: Optional[str] = None @@ -4395,7 +4513,9 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace(libstrings.normalize_spaces(self._column)) + return self._return_or_inplace( + libstrings.normalize_spaces(self._column) + ) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4498,7 +4618,9 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: ) return result - def detokenize(self, indices: "cudf.Series", separator: str = " ") -> SeriesOrIndex: + def detokenize( + self, indices: "cudf.Series", separator: str = " " + ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -4668,7 +4790,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: retain_index=False, ) - def character_ngrams(self, n: int = 2, as_list: bool = False) -> SeriesOrIndex: + def character_ngrams( + self, n: int = 2, as_list: bool = False + ) -> SeriesOrIndex: """ Generate the n-grams from characters in a column of strings. @@ -4731,7 +4855,9 @@ def character_ngrams(self, n: int = 2, as_list: bool = False) -> SeriesOrIndex: return result.explode() return result - def hash_character_ngrams(self, n: int = 5, as_list: bool = False) -> SeriesOrIndex: + def hash_character_ngrams( + self, n: int = 5, as_list: bool = False + ) -> SeriesOrIndex: """ Generate hashes of n-grams from characters in a column of strings. The MurmurHash32 algorithm is used to produce the hash results. 
@@ -4880,7 +5006,8 @@ def replace_tokens( delimiter = "" elif not is_scalar(delimiter): raise TypeError( - f"Type of delimiter should be a string," f" found {type(delimiter)}" + f"Type of delimiter should be a string," + f" found {type(delimiter)}" ) return self._return_or_inplace( @@ -4942,14 +5069,16 @@ def filter_tokens( replacement = "" elif not is_scalar(replacement): raise TypeError( - f"Type of replacement should be a string," f" found {type(replacement)}" + f"Type of replacement should be a string," + f" found {type(replacement)}" ) if delimiter is None: delimiter = "" elif not is_scalar(delimiter): raise TypeError( - f"Type of delimiter should be a string," f" found {type(delimiter)}" + f"Type of delimiter should be a string," + f" found {type(delimiter)}" ) return self._return_or_inplace( @@ -4980,7 +5109,9 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace(libstrings.porter_stemmer_measure(self._column)) + return self._return_or_inplace( + libstrings.porter_stemmer_measure(self._column) + ) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5148,13 +5279,17 @@ def edit_distance_matrix(self) -> SeriesOrIndex: dtype: list """ if self._column.size < 2: - raise ValueError("Require size >= 2 to compute edit distance matrix.") + raise ValueError( + "Require size >= 2 to compute edit distance matrix." + ) if self._column.has_nulls(): raise ValueError( "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." ) - return self._return_or_inplace(libstrings.edit_distance_matrix(self._column)) + return self._return_or_inplace( + libstrings.edit_distance_matrix(self._column) + ) def minhash( self, seeds: Optional[ColumnLike] = None, width: int = 4 @@ -5364,7 +5499,9 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: - offsets = column.as_column(0, length=size + 1, dtype=size_type_dtype) + offsets = column.as_column( + 0, length=size + 1, dtype=size_type_dtype + ) children = (offsets,) @@ -5409,7 +5546,9 @@ def end_offset(self) -> int: and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( - self.base_children[0].element_indexing(self.offset + self.size) + self.base_children[0].element_indexing( + self.offset + self.size + ) ) else: self._end_offset = 0 @@ -5422,7 +5561,9 @@ def memory_usage(self) -> int: if self.data is not None: n += self.data.size if len(self.base_children) == 1: - child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize + child0_size = (self.size + 1) * self.base_children[ + 0 + ].dtype.itemsize n += child0_size if self.nullable: @@ -5449,7 +5590,9 @@ def data(self): ): self._data = self.base_data else: - self._data = self.base_data[self.start_offset : self.end_offset] + self._data = self.base_data[ + self.start_offset : self.end_offset + ] return self._data def all(self, skipna: bool = True) -> bool: @@ -5467,7 +5610,9 @@ def any(self, skipna: bool = True) -> bool: raise NotImplementedError("`any` not implemented for `StringColumn`") - def data_array_view(self, *, mode="write") -> cuda.devicearray.DeviceNDArray: + def data_array_view( + self, *, mode="write" + ) -> cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") def to_arrow(self) -> pa.Array: @@ -5487,7 +5632,9 @@ def to_arrow(self) -> pa.Array: ] """ if self.null_count == len(self): - return pa.NullArray.from_buffers(pa.null(), len(self), [pa.py_buffer(b"")]) + return 
pa.NullArray.from_buffers( + pa.null(), len(self), [pa.py_buffer(b"")] + ) else: return super().to_arrow() @@ -5497,7 +5644,9 @@ def sum( dtype: Optional[Dtype] = None, min_count: int = 0, ): - result_col = self._process_for_reduction(skipna=skipna, min_count=min_count) + result_col = self._process_for_reduction( + skipna=skipna, min_count=min_count + ) if isinstance(result_col, type(self)): return libstrings.join( result_col, @@ -5517,7 +5666,9 @@ def __contains__(self, item: ScalarLike) -> bool: self, column.as_column(item, dtype=self.dtype) ) - def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: @@ -5548,7 +5699,9 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if dtype.kind == "M": without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) all_same_length = ( - libstrings.count_characters(without_nat).distinct_count(dropna=True) + libstrings.count_characters(without_nat).distinct_count( + dropna=True + ) == 1 ) if not all_same_length: @@ -5588,7 +5741,9 @@ def as_datetime_column( if self.null_count == len(self): return cast( "cudf.core.column.DatetimeColumn", - column.column_empty(len(self), dtype=out_dtype, masked=True), + column.column_empty( + len(self), dtype=out_dtype, masked=True + ), ) else: format = datetime.infer_format( @@ -5609,10 +5764,14 @@ def as_timedelta_column( format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_decimal_column(self, dtype: Dtype) -> "cudf.core.column.DecimalBaseColumn": + def as_decimal_column( + self, dtype: Dtype + ) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) - def as_string_column(self, dtype: Dtype, format: str | None = None) -> StringColumn: + def as_string_column( + self, dtype: Dtype, format: str | None = None + ) -> StringColumn: return self @property @@ -5637,7 +5796,9 @@ def to_pandas( arrow_type: bool = False, ) -> pd.Series: if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) if arrow_type: return pd.Series( pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index @@ -5653,7 +5814,10 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif to_dtype.kind in {"i", "u"} and not libstrings.is_integer(self).all(): + elif ( + to_dtype.kind in {"i", "u"} + and not libstrings.is_integer(self).all() + ): return False elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): return False @@ -5680,9 +5844,14 @@ def find_and_replace( f"value dtype: {replacement_col.dtype}" ) - if to_replace_col.dtype != self.dtype and replacement_col.dtype != self.dtype: + if ( + to_replace_col.dtype != self.dtype + and replacement_col.dtype != self.dtype + ): return self.copy() - df = cudf.DataFrame._from_data({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: res = self.fillna( @@ -5710,7 +5879,9 @@ def fillna( fill_value = cudf.Scalar(fill_value, dtype=self.dtype) return super().fillna(fill_value, method=method) - def normalize_binop_value(self, other) -> Union[column.ColumnBase, cudf.Scalar]: + def normalize_binop_value( + self, other + ) -> Union[column.ColumnBase, cudf.Scalar]: if ( isinstance(other, (column.ColumnBase, cudf.Scalar)) and other.dtype == "object" @@ -5720,7 +5891,9 @@ def normalize_binop_value(self, other) -> Union[column.ColumnBase, cudf.Scalar]: return cudf.Scalar(other) return NotImplemented - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> "column.ColumnBase": + def _binaryop( + self, other: ColumnBinaryOperand, op: str + ) -> "column.ColumnBase": reflect, op = self._check_reflected_op(op) # Due to https://github.com/pandas-dev/pandas/issues/46332 we need to # support binary operations between empty or all null string columns @@ -5765,7 +5938,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> "column.ColumnBase": if isinstance(other, cudf.Scalar): other = cast( StringColumn, - column.as_column(other, length=len(self), dtype="object"), + column.as_column( + other, length=len(self), dtype="object" + ), ) # Explicit types are necessary because mypy infers ColumnBase @@ -5792,13 +5967,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> "column.ColumnBase": "NULL_EQUALS", }: lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype="bool") + return libcudf.binaryop.binaryop( + lhs=lhs, rhs=rhs, op=op, dtype="bool" + ) return NotImplemented @copy_docstring(column.ColumnBase.view) def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: - raise ValueError("Can not produce a view of a string column with nulls") + raise ValueError( + "Can not produce a view of a string column with nulls" + ) dtype = cudf.api.types.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( @@ -5827,7 +6006,9 @@ def _get_cols_list(parent_obj, others): and len(others) > 0 and ( can_convert_to_column( - others.iloc[0] if isinstance(others, cudf.Series) else others[0] + others.iloc[0] + if isinstance(others, cudf.Series) + else others[0] ) ) ): diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index b3026d2478d..1b2ffcc2700 100644 --- a/python/cudf/cudf/core/column/struct.py +++ 
b/python/cudf/cudf/core/column/struct.py @@ -35,12 +35,17 @@ def base_size(self): def to_arrow(self): children = [ - pa.nulls(len(child)) if len(child) == child.null_count else child.to_arrow() + pa.nulls(len(child)) + if len(child) == child.null_count + else child.to_arrow() for child in self.children ] pa_type = pa.struct( - {field: child.type for field, child in zip(self.dtype.fields, children)} + { + field: child.type + for field, child in zip(self.dtype.fields, children) + } ) if self.nullable: @@ -62,12 +67,16 @@ def to_pandas( # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series(pd.arrays.ArrowExtensionArray(pa_array), index=index) + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) else: return pd.Series(pa_array.tolist(), dtype="object", index=index) @@ -85,7 +94,8 @@ def memory_usage(self): def element_indexing(self, index: int): result = super().element_indexing(index) return { - field: value for field, value in zip(self.dtype.fields, result.values()) + field: value + for field, value in zip(self.dtype.fields, result.values()) } def __setitem__(self, key, value): @@ -162,7 +172,9 @@ class StructMethods(ColumnMethods): def __init__(self, parent=None): if not isinstance(parent.dtype, StructDtype): - raise AttributeError("Can only use .struct accessor with a 'struct' dtype") + raise AttributeError( + "Can only use .struct accessor with a 'struct' dtype" + ) super().__init__(parent=parent) def field(self, key): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index cc45b0e1956..0d24e8e5120 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -122,13 +122,17 @@ def values(self): """ Return a CuPy representation of the TimeDeltaColumn. 
""" - raise NotImplementedError("TimeDelta Arrays is not yet implemented in cudf") + raise NotImplementedError( + "TimeDelta Arrays is not yet implemented in cudf" + ) @acquire_spill_lock() def to_arrow(self) -> pa.Array: mask = None if self.nullable: - mask = pa.py_buffer(self.mask_array_view(mode="read").copy_to_host()) + mask = pa.py_buffer( + self.mask_array_view(mode="read").copy_to_host() + ) data = pa.py_buffer( self.as_numerical_column("int64") .data_array_view(mode="read") @@ -172,7 +176,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this = self.astype(common_dtype).astype(out_dtype) if isinstance(other, cudf.Scalar): if other.is_valid(): - other = other.value.astype(common_dtype).astype(out_dtype) + other = other.value.astype(common_dtype).astype( + out_dtype + ) else: other = cudf.Scalar(None, out_dtype) else: @@ -199,9 +205,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs, rhs = (other, this) if reflect else (this, other) result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if cudf.get_option("mode.pandas_compatible") and out_dtype == cudf.dtype( - np.bool_ - ): + if cudf.get_option( + "mode.pandas_compatible" + ) and out_dtype == cudf.dtype(np.bool_): result = result.fillna(op == "__ne__") return result @@ -263,7 +269,9 @@ def fillna( fill_value = column.as_column(fill_value, nan_as_null=False) return super().fillna(fill_value, method) - def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn": + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, dtype=np.int64, @@ -276,7 +284,9 @@ def as_numerical_column(self, dtype: Dtype) -> "cudf.core.column.NumericalColumn def as_datetime_column( self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": - raise TypeError(f"cannot astype a timedelta from {self.dtype} to {dtype}") + raise TypeError( + f"cannot astype a timedelta from {self.dtype} to {dtype}" + ) def as_string_column( self, dtype: Dtype, format: str | None = None @@ -286,9 +296,9 @@ def as_string_column( self.dtype.name, "%D days %H:%M:%S" ) if len(self) > 0: - return string._timedelta_to_str_typecast_functions[cudf.dtype(self.dtype)]( - self, format=format - ) + return string._timedelta_to_str_typecast_functions[ + cudf.dtype(self.dtype) + ](self, format=format) else: return cast( "cudf.core.column.StringColumn", @@ -332,7 +342,9 @@ def quantile( return_scalar=return_scalar, ) if return_scalar: - return pd.Timedelta(result, unit=self.time_unit).as_unit(self.time_unit) + return pd.Timedelta(result, unit=self.time_unit).as_unit( + self.time_unit + ) return result.astype(self.dtype) def sum( @@ -370,7 +382,9 @@ def cov(self, other: TimeDeltaColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov(other.as_numerical_column("int64")) + return self.as_numerical_column("int64").cov( + other.as_numerical_column("int64") + ) def corr(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): @@ -419,35 +433,35 @@ def components(self, index=None) -> "cudf.DataFrame": data = { "days": self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ), "hours": ( self % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], 
"ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ) ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["h"], "ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion["h"], "ns" + ).astype(self.dtype) ), "minutes": ( self % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["h"], "ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion["h"], "ns" + ).astype(self.dtype) ) ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["m"], "ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion["m"], "ns" + ).astype(self.dtype) ), } keys_list = iter(date_meta.keys()) @@ -461,9 +475,9 @@ def components(self, index=None) -> "cudf.DataFrame": ).astype(self.dtype) ) ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion[value[1]], "ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion[value[1]], "ns" + ).astype(self.dtype) ) if self._time_unit == value[1]: break @@ -511,11 +525,13 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": return ( self % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( - self.dtype - ) + np.timedelta64( + _unit_to_nanoseconds_conversion["D"], "ns" + ).astype(self.dtype) ) - ) // cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns")) + ) // cudf.Scalar( + np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") + ) @property def microseconds(self) -> "cudf.core.column.NumericalColumn": @@ -533,10 +549,12 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": return ( self - % np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns").astype( - self.dtype - ) - ) // cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns")) + % np.timedelta64( + _unit_to_nanoseconds_conversion["s"], "ns" + ).astype(self.dtype) + ) // cudf.Scalar( + np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") + ) @property def nanoseconds(self) -> "cudf.core.column.NumericalColumn": @@ -560,8 +578,12 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": return cast("cudf.core.column.NumericalColumn", res_col) return ( self - % cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns")) - ) // cudf.Scalar(np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns")) + % cudf.Scalar( + np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") + ) + ) // cudf.Scalar( + np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") + ) def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index faed3abc618..33085bede78 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -169,7 +169,9 @@ def __repr__(self) -> str: f"multiindex={self.multiindex}, " f"level_names={self.level_names})" ) - column_info = "\n".join([f"{name}: {col.dtype}" for name, col in self.items()]) + column_info = "\n".join( + [f"{name}: {col.dtype}" for name, col in self.items()] + ) return f"{type_info}\n{column_info}" @property @@ -241,7 +243,9 @@ def to_pandas_index(self) -> pd.Index: # Determine if we can return a RangeIndex if self.rangeindex: if not self.names: - return pd.RangeIndex(start=0, stop=0, step=1, name=self.name) + return pd.RangeIndex( + start=0, stop=0, step=1, name=self.name + ) elif cudf.api.types.infer_dtype(self.names) == 
"integer": if len(self.names) == 1: start = self.names[0] @@ -251,7 +255,9 @@ def to_pandas_index(self) -> pd.Index: uniques = np.unique(np.diff(np.array(self.names))) if len(uniques) == 1 and uniques[0] != 0: diff = uniques[0] - new_range = range(self.names[0], self.names[-1] + diff, diff) + new_range = range( + self.names[0], self.names[-1] + diff, diff + ) return pd.RangeIndex(new_range, name=self.name) result = pd.Index( self.names, @@ -261,7 +267,9 @@ def to_pandas_index(self) -> pd.Index: ) return result - def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): + def insert( + self, name: Any, value: Any, loc: int = -1, validate: bool = True + ): """ Insert column into the ColumnAccessor at the specified location. @@ -284,7 +292,9 @@ def insert(self, name: Any, value: Any, loc: int = -1, validate: bool = True): if loc == -1: loc = ncols if not (0 <= loc <= ncols): - raise ValueError("insert: loc out of bounds: must be 0 <= loc <= ncols") + raise ValueError( + "insert: loc out of bounds: must be 0 <= loc <= ncols" + ) # TODO: we should move all insert logic here if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") @@ -360,7 +370,9 @@ def get_labels_by_index(self, index: Any) -> tuple: return (self.names[index],) elif (bn := len(index)) > 0 and all(map(is_bool, index)): if bn != (n := len(self.names)): - raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") + raise IndexError( + f"Boolean mask has wrong length: {bn} not {n}" + ) if isinstance(index, (pd.Series, cudf.Series)): # Don't allow iloc indexing with series raise NotImplementedError( @@ -462,9 +474,13 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: # Special-casing for boolean mask if (bn := len(key)) > 0 and all(map(is_bool, key)): if bn != (n := len(self.names)): - raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") + raise IndexError( + f"Boolean mask has wrong length: {bn} not {n}" + ) data = dict( - item for item, keep in zip(self._grouped_data.items(), key) if keep + item + for item, keep in zip(self._grouped_data.items(), key) + if keep ) else: data = {k: self._grouped_data[k] for k in key} @@ -625,9 +641,12 @@ def droplevel(self, level): level += self.nlevels self._data = { - _remove_key_level(key, level): value for key, value in self._data.items() + _remove_key_level(key, level): value + for key, value in self._data.items() } - self._level_names = self._level_names[:level] + self._level_names[level + 1 :] + self._level_names = ( + self._level_names[:level] + self._level_names[level + 1 :] + ) if ( len(self._level_names) == 1 @@ -685,7 +704,9 @@ def _get_level(x, nlevels, level_names): if x < 0: x += nlevels if x >= nlevels: - raise IndexError(f"Level {x} out of bounds. Index has {nlevels} levels.") + raise IndexError( + f"Level {x} out of bounds. Index has {nlevels} levels." + ) return x else: x = level_names.index(x) diff --git a/python/cudf/cudf/core/common.py b/python/cudf/cudf/core/common.py index 8cf3d35b11f..5276cd518e5 100644 --- a/python/cudf/cudf/core/common.py +++ b/python/cudf/cudf/core/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. 
def pipe(obj, func, *args, **kwargs): @@ -28,7 +28,9 @@ def pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - raise ValueError(f"{target} is both the pipe target and a keyword argument") + raise ValueError( + f"{target} is both the pipe target and a keyword argument" + ) kwargs[target] = obj return func(*args, **kwargs) else: diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4d10db7fb58..6afbc0bbc65 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast @@ -61,14 +61,18 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # TODO: we should fix this further up. # Alternately we can have an Optional[Column] and handle None # specially in _gather. - self.column = cast("NumericalColumn", self.column.astype(size_type_dtype)) + self.column = cast( + "NumericalColumn", self.column.astype(size_type_dtype) + ) else: if self.column.dtype.kind not in {"i", "u"}: raise TypeError("Gather map must have integer dtype") if not nullify: lo, hi = libcudf.reduce.minmax(self.column) if lo.value < -nrows or hi.value >= nrows: - raise IndexError(f"Gather map is out of bounds for [0, {nrows})") + raise IndexError( + f"Gather map is out of bounds for [0, {nrows})" + ) @classmethod def from_column_unchecked( diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index ae7cad33734..ccf730c91fb 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
from collections import abc @@ -159,7 +159,9 @@ def cut( # create bins if given an int or single scalar if not isinstance(bins, pd.IntervalIndex): if not isinstance(bins, (abc.Sequence)): - if isinstance(x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)): + if isinstance( + x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray) + ): mn = x.min() mx = x.max() else: @@ -206,10 +208,14 @@ def cut( old_bins[0], old_bins[1], periods=1, closed=closed ) else: - interval_labels = IntervalIndex.from_breaks(old_bins, closed=closed) + interval_labels = IntervalIndex.from_breaks( + old_bins, closed=closed + ) else: # get labels for categories - interval_labels = IntervalIndex.from_breaks(int_label_bins, closed=closed) + interval_labels = IntervalIndex.from_breaks( + int_label_bins, closed=closed + ) elif labels is not False: if not (is_list_like(labels)): raise ValueError( @@ -232,7 +238,9 @@ def cut( labels, categories=None, ordered=False ) else: - interval_labels = labels if len(set(labels)) == len(labels) else None + interval_labels = ( + labels if len(set(labels)) == len(labels) else None + ) if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 191fa7cf125..35588725655 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -135,7 +135,10 @@ def _shape_mismatch_error(x, y): class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - if isinstance(self._frame.index, MultiIndex) or self._frame._data.multiindex: + if ( + isinstance(self._frame.index, MultiIndex) + or self._frame._data.multiindex + ): # This try/except block allows the use of pandas-like # tuple arguments into MultiIndex dataframes. 
try: @@ -168,7 +171,9 @@ def _can_downcast_to_series(self, df, arg): if not is_scalar(arg[1]): return False elif (is_list_like(arg[0]) or is_column_like(arg[0])) and ( - is_list_like(arg[1]) or is_column_like(arg[0]) or type(arg[1]) is slice + is_list_like(arg[1]) + or is_column_like(arg[0]) + or type(arg[1]) is slice ): return False else: @@ -178,7 +183,9 @@ def _can_downcast_to_series(self, df, arg): return True dtypes = df.dtypes.values.tolist() all_numeric = all(is_numeric_dtype(t) for t in dtypes) - if all_numeric or (len(dtypes) and all(t == dtypes[0] for t in dtypes)): + if all_numeric or ( + len(dtypes) and all(t == dtypes[0] for t in dtypes) + ): return True if isinstance(arg[1], tuple): return True @@ -311,7 +318,9 @@ def _getitem_tuple_arg(self, arg): as_column( tmp_arg[0], dtype=self._frame.index.dtype - if isinstance(self._frame.index.dtype, cudf.CategoricalDtype) + if isinstance( + self._frame.index.dtype, cudf.CategoricalDtype + ) else None, ), tmp_arg[1], @@ -323,17 +332,27 @@ def _getitem_tuple_arg(self, arg): ) else: tmp_col_name = str(uuid4()) - cantor_name = "_" + "_".join(map(str, columns_df._data.names)) + cantor_name = "_" + "_".join( + map(str, columns_df._data.names) + ) if columns_df._data.multiindex: # column names must be appropriate length tuples - extra = tuple("" for _ in range(columns_df._data.nlevels - 1)) + extra = tuple( + "" for _ in range(columns_df._data.nlevels - 1) + ) tmp_col_name = (tmp_col_name, *extra) cantor_name = (cantor_name, *extra) other_df = DataFrame( - {tmp_col_name: column.as_column(range(len(tmp_arg[0])))}, + { + tmp_col_name: column.as_column( + range(len(tmp_arg[0])) + ) + }, index=as_index(tmp_arg[0]), ) - columns_df[cantor_name] = column.as_column(range(len(columns_df))) + columns_df[cantor_name] = column.as_column( + range(len(columns_df)) + ) df = other_df.join(columns_df, how="inner") # as join is not assigning any names to index, # update it over here @@ -351,7 +370,10 @@ def _getitem_tuple_arg(self, arg): @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): - if isinstance(self._frame.index, MultiIndex) or self._frame._data.multiindex: + if ( + isinstance(self._frame.index, MultiIndex) + or self._frame._data.multiindex + ): raise NotImplementedError( "Setting values using df.loc[] not supported on " "DataFrames with a MultiIndex" @@ -379,7 +401,9 @@ def _setitem_tuple_arg(self, key, value): new_col = cudf.Series(value, index=idx) if not self._frame.empty: - new_col = new_col._align_to_index(self._frame.index, how="right") + new_col = new_col._align_to_index( + self._frame.index, how="right" + ) if self._frame.empty: self._frame.index = ( @@ -453,7 +477,9 @@ def __getitem__(self, arg): column_names, ), ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame) - row_spec = indexing_utils.parse_row_iloc_indexer(row_key, len(self._frame)) + row_spec = indexing_utils.parse_row_iloc_indexer( + row_key, len(self._frame) + ) ca = self._frame._data index = self._frame.index if col_is_scalar: @@ -685,7 +711,9 @@ def __init__( self._reindex( column_names=columns, index=index, deep=False, inplace=True ) - if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)): + if isinstance( + columns, (range, pd.RangeIndex, cudf.RangeIndex) + ): self._data.rangeindex = True else: self._data = data._data @@ -729,7 +757,9 @@ def __init__( label_dtype = getattr(columns, "dtype", None) self._data = ColumnAccessor( { - k: column.column_empty(len(self), dtype="object", masked=True) + k: column.column_empty( + len(self), 
dtype="object", masked=True + ) for k in columns }, level_names=tuple(columns.names) @@ -749,9 +779,13 @@ def __init__( # descr is an optional field of the _cuda_ary_iface_ if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: - new_df = self._from_arrays(data, index=index, columns=columns) + new_df = self._from_arrays( + data, index=index, columns=columns + ) else: - new_df = self.from_records(data, index=index, columns=columns) + new_df = self.from_records( + data, index=index, columns=columns + ) else: new_df = self._from_arrays(data, index=index, columns=columns) @@ -794,12 +828,18 @@ def __init__( ) self._data.rangeindex = rangeindex self._data.label_dtype = ( - cudf.dtype(label_dtype) if label_dtype is not None else None + cudf.dtype(label_dtype) + if label_dtype is not None + else None ) elif len(data) > 0 and isinstance(data[0], Series): - self._init_from_series_list(data=data, columns=columns, index=index) + self._init_from_series_list( + data=data, columns=columns, index=index + ) else: - self._init_from_list_like(data, index=index, columns=columns) + self._init_from_list_like( + data, index=index, columns=columns + ) self._check_data_index_length_match() else: if not is_dict_like(data): @@ -881,7 +921,8 @@ def _init_from_series_list(self, data, columns, index): for idx, series in enumerate(data): if not series.index.is_unique: raise ValueError( - "Reindexing only valid with uniquely valued Index " "objects" + "Reindexing only valid with uniquely valued Index " + "objects" ) if not series.index.equals(final_columns): series = series.reindex(final_columns) @@ -945,7 +986,9 @@ def _init_from_list_like(self, data, index=None, columns=None): elif len(data) > 0 and isinstance(data[0], pd.Interval): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data - elif any(not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data): + elif any( + not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data + ): raise TypeError("Inputs should be an iterable or sequence.") elif len(data) > 0 and not can_convert_to_column(data[0]): raise ValueError("Must pass 2-d input.") @@ -986,7 +1029,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self._data.label_dtype = getattr(columns, "dtype", None) @_cudf_nvtx_annotate - def _init_from_dict_like(self, data, index=None, columns=None, nan_as_null=None): + def _init_from_dict_like( + self, data, index=None, columns=None, nan_as_null=None + ): label_dtype = None if columns is not None: label_dtype = getattr(columns, "dtype", None) @@ -1009,7 +1054,9 @@ def _init_from_dict_like(self, data, index=None, columns=None, nan_as_null=None) masked=index is not None, ) - data = {c: data[c] if c in data else empty_column() for c in columns} + data = { + c: data[c] if c in data else empty_column() for c in columns + } data, index = self._align_input_series_indices(data, index=index) @@ -1088,7 +1135,9 @@ def _align_input_series_indices(data, index): ] else: - aligned_input_series = cudf.core.series._align_indices(input_series) + aligned_input_series = cudf.core.series._align_indices( + input_series + ) index = aligned_input_series[0].index for name, val in data.items(): @@ -1126,7 +1175,9 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] - obj = super().deserialize(header, frames[header["index_frame_count"] :]) + obj = super().deserialize( + header, frames[header["index_frame_count"] :] + ) idx_typ = 
pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) @@ -1180,7 +1231,9 @@ def __dir__(self): o = set(dir(type(self))) o.update(self.__dict__) o.update( - c for c in self._column_names if isinstance(c, str) and c.isidentifier() + c + for c in self._column_names + if isinstance(c, str) and c.isidentifier() ) return list(o) @@ -1297,7 +1350,9 @@ def __getitem__(self, arg): elif isinstance(arg, DataFrame): return self.where(arg) else: - raise TypeError(f"__getitem__ on type {type(arg)} is not supported") + raise TypeError( + f"__getitem__ on type {type(arg)} is not supported" + ) @_cudf_nvtx_annotate def __setitem__(self, arg, value): @@ -1315,9 +1370,9 @@ def __setitem__(self, arg, value): if is_scalar(value): self._data[col_name][scatter_map] = value else: - self._data[col_name][scatter_map] = column.as_column(value)[ - scatter_map - ] + self._data[col_name][scatter_map] = column.as_column( + value + )[scatter_map] elif is_scalar(arg) or isinstance(arg, tuple): if isinstance(value, DataFrame): _setitem_with_dataframe( @@ -1403,12 +1458,16 @@ def __setitem__(self, arg, value): else: for col in arg: if is_scalar(value): - self._data[col] = as_column(value, length=len(self)) + self._data[col] = as_column( + value, length=len(self) + ) else: self._data[col] = column.as_column(value) else: - raise TypeError(f"__setitem__ on type {type(arg)} is not supported") + raise TypeError( + f"__setitem__ on type {type(arg)} is not supported" + ) def __delitem__(self, name): self._drop_column(name) @@ -1503,7 +1562,9 @@ def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): @classmethod @_cudf_nvtx_annotate - def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): + def _concat( + cls, objs, axis=0, join="outer", ignore_index=False, sort=False + ): # flag to indicate at least one empty input frame also has an index empty_has_index = False # length of output frame's RangeIndex if all input frames are empty, @@ -1544,7 +1605,9 @@ def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): intersecting_columns = functools.reduce( set.intersection, sets_of_column_names ) - union_of_columns = functools.reduce(set.union, sets_of_column_names) + union_of_columns = functools.reduce( + set.union, sets_of_column_names + ) non_intersecting_columns = union_of_columns.symmetric_difference( intersecting_columns ) @@ -1562,7 +1625,8 @@ def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): if axis == 0: if ignore_index and ( - num_empty_input_frames > 0 or len(intersecting_columns) == 0 + num_empty_input_frames > 0 + or len(intersecting_columns) == 0 ): # When ignore_index is True and if there is # at least 1 empty dataframe and no @@ -1602,7 +1666,9 @@ def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): # can't sort anything else. 
try: str_names = sorted(n for n in names if isinstance(n, str)) - non_str_names = sorted(n for n in names if not isinstance(n, str)) + non_str_names = sorted( + n for n in names if not isinstance(n, str) + ) names = non_str_names + str_names except TypeError: names = list(names) @@ -1618,7 +1684,8 @@ def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): columns = [ ( [] - if are_all_range_index or (ignore_index and not empty_has_index) + if are_all_range_index + or (ignore_index and not empty_has_index) else list(f._index._data.columns) ) + [f._data[name] if name in f._data else None for name in names] @@ -1684,7 +1751,9 @@ def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): if empty_has_index and num_empty_input_frames == len(objs): out._index = cudf.RangeIndex(result_index_length) elif are_all_range_index and not ignore_index: - out._index = cudf.core.index.Index._concat([o._index for o in objs]) + out._index = cudf.core.index.Index._concat( + [o._index for o in objs] + ) # Reassign the categories for any categorical table cols _reassign_categories( @@ -1701,9 +1770,13 @@ def _concat(cls, objs, axis=0, join="outer", ignore_index=False, sort=False): if not isinstance(out._index, MultiIndex) and isinstance( out._index._values.dtype, cudf.CategoricalDtype ): - out = out.set_index(cudf.core.index.as_index(out.index._values)) + out = out.set_index( + cudf.core.index.as_index(out.index._values) + ) for name, col in out._data.items(): - out._data[name] = col._with_type_metadata(tables[0]._data[name].dtype) + out._data[name] = col._with_type_metadata( + tables[0]._data[name].dtype + ) # Reassign index and column names if objs[0]._data.multiindex: @@ -1762,7 +1835,9 @@ def _clean_renderable_dataframe(self, output): if lines[-1].startswith("["): lines = lines[:-1] - lines.append("[%d rows x %d columns]" % (len(self), len(self._data.names))) + lines.append( + "[%d rows x %d columns]" % (len(self), len(self._data.names)) + ) return "\n".join(lines) def _clean_nulls_from_dataframe(self, df): @@ -1775,7 +1850,9 @@ def _clean_nulls_from_dataframe(self, df): filling with `` values. """ for col in df._data: - if isinstance(df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype)): + if isinstance( + df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) + ): # TODO we need to handle this pass elif df._data[col].has_nulls(): @@ -1846,7 +1923,9 @@ def _get_renderable_dataframe(self): right_cols = len(self._data.names) - int(ncols / 2.0) # adjust right columns for output if multiindex. right_cols = ( - right_cols - 1 if isinstance(self.index, MultiIndex) else right_cols + right_cols - 1 + if isinstance(self.index, MultiIndex) + else right_cols ) left_cols = int(ncols / 2.0) + 1 if right_cols > 0: @@ -1885,11 +1964,17 @@ def __repr__(self): @_cudf_nvtx_annotate def _repr_html_(self): - lines = self._get_renderable_dataframe().to_pandas()._repr_html_().split("\n") + lines = ( + self._get_renderable_dataframe() + .to_pandas() + ._repr_html_() + .split("\n") + ) if lines[-2].startswith("
<p>"): lines = lines[:-2] lines.append( - "<p>%d rows × %d columns</p>" % (len(self), len(self._data.names)) + "<p>%d rows × %d columns</p>
" + % (len(self), len(self._data.names)) ) lines.append("") return "\n".join(lines) @@ -1899,7 +1984,9 @@ def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, *, downcast=False) -> Self | Series: + def _get_columns_by_label( + self, labels, *, downcast=False + ) -> Self | Series: """ Return columns of dataframe by `labels` @@ -1950,7 +2037,11 @@ def _make_operands_and_index_for_binop( if ( not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS - and (not self._data.to_pandas_index().equals(other.index.to_pandas())) + and ( + not self._data.to_pandas_index().equals( + other.index.to_pandas() + ) + ) ): raise ValueError( "Can only compare DataFrame & Series objects " @@ -1961,7 +2052,9 @@ def _make_operands_and_index_for_binop( # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) - equal_columns = other.index.to_pandas().equals(self._data.to_pandas_index()) + equal_columns = other.index.to_pandas().equals( + self._data.to_pandas_index() + ) can_use_self_column_name = ( equal_columns or list(other._index._data.names) == self._data._level_names @@ -1989,7 +2082,8 @@ def _make_operands_and_index_for_binop( left_default = fill_value equal_columns = self._column_names == other._column_names can_use_self_column_name = ( - equal_columns or self._data._level_names == other._data._level_names + equal_columns + or self._data._level_names == other._data._level_names ) elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately @@ -2125,7 +2219,9 @@ def from_dict( ): result = cls(data).T result.columns = ( - columns if columns is not None else range(len(result._data)) + columns + if columns is not None + else range(len(result._data)) ) if dtype is not None: result = result.astype(dtype) @@ -2141,14 +2237,22 @@ def from_dict( ) elif orient == "columns": if columns is not None: - raise ValueError("Cannot use columns parameter with orient='columns'") + raise ValueError( + "Cannot use columns parameter with orient='columns'" + ) return cls(data, columns=None, dtype=dtype) elif orient == "tight": if columns is not None: - raise ValueError("Cannot use columns parameter with orient='right'") + raise ValueError( + "Cannot use columns parameter with orient='right'" + ) - index = _from_dict_create_index(data["index"], data["index_names"], cudf) - columns = _from_dict_create_index(data["columns"], data["column_names"], pd) + index = _from_dict_create_index( + data["index"], data["index_names"], cudf + ) + columns = _from_dict_create_index( + data["columns"], data["column_names"], pd + ) return cls(data["data"], index=index, columns=columns, dtype=dtype) else: raise ValueError( @@ -2268,7 +2372,9 @@ def to_dict( elif issubclass(into, abc.Mapping): cons = into # type: ignore[assignment] if issubclass(into, defaultdict): - raise TypeError("to_dict() only accepts initialized defaultdicts") + raise TypeError( + "to_dict() only accepts initialized defaultdicts" + ) else: raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] @@ -2362,7 +2468,8 @@ def scatter_by_map( if map_size: result += [ - self._empty_like(keep_index) for _ in range(map_size - len(result)) + self._empty_like(keep_index) + for _ in range(map_size - len(result)) ] return result @@ -2424,7 +2531,9 @@ def update( if join != "left": raise 
NotImplementedError("Only left join is supported") if errors not in {"ignore", "raise"}: - raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + raise ValueError( + "The parameter errors must be either 'ignore' or 'raise'" + ) if filter_func is not None: raise NotImplementedError("filter_func is not supported yet") @@ -2481,7 +2590,9 @@ def equals(self, other): ret = super().equals(other) # If all other checks matched, validate names. if ret: - for self_name, other_name in zip(self._data.names, other._data.names): + for self_name, other_name in zip( + self._data.names, other._data.names + ): if self_name != other_name: ret = False break @@ -2855,7 +2966,9 @@ def set_index( raise TypeError(msg) if isinstance(col, (MultiIndex, pd.MultiIndex)): col = ( - cudf.from_pandas(col) if isinstance(col, pd.MultiIndex) else col + cudf.from_pandas(col) + if isinstance(col, pd.MultiIndex) + else col ) cols = [col._data[x] for x in col._data] columns_to_add.extend(cols) @@ -2867,7 +2980,9 @@ def set_index( else: # For pandas obj, convert to gpu obj columns_to_add.append(as_column(col)) - if isinstance(col, (cudf.Series, cudf.Index, pd.Series, pd.Index)): + if isinstance( + col, (cudf.Series, cudf.Index, pd.Series, pd.Index) + ): names.append(col.name) else: names.append(None) @@ -2925,7 +3040,9 @@ def where(self, cond, other=None, inplace=False): {name: cond._column for name in self._column_names}, ) elif hasattr(cond, "__cuda_array_interface__"): - cond = DataFrame(cond, columns=self._column_names, index=self.index) + cond = DataFrame( + cond, columns=self._column_names, index=self.index + ) elif ( hasattr(cond, "__array_interface__") and cond.__array_interface__["shape"] != self.shape @@ -2939,7 +3056,9 @@ def where(self, cond, other=None, inplace=False): cond = cond.reindex(self.index) else: if cond.shape != self.shape: - raise ValueError("Array conditional must be same shape as self") + raise ValueError( + "Array conditional must be same shape as self" + ) # Setting `self` column names to `cond` as it has no column names. cond._set_columns_like(self._data) @@ -2968,7 +3087,9 @@ def where(self, cond, other=None, inplace=False): ) if cond_col := cond._data.get(name): - result = cudf._lib.copying.copy_if_else(col, other_col, cond_col) + result = cudf._lib.copying.copy_if_else( + col, other_col, cond_col + ) out[name] = _make_categorical_like(result, self._data[name]) else: @@ -2978,7 +3099,9 @@ def where(self, cond, other=None, inplace=False): ) out[name] = col.set_mask(out_mask) - return self._mimic_inplace(self._from_data_like_self(out), inplace=inplace) + return self._mimic_inplace( + self._from_data_like_self(out), inplace=inplace + ) @docutils.doc_apply( doc_reset_index_template.format( @@ -3114,7 +3237,9 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): # least require a deprecation cycle because we currently support # inserting a pd.Categorical. 
if isinstance(value, pd.Categorical): - value = cudf.core.column.categorical.pandas_categorical_as_column(value) + value = cudf.core.column.categorical.pandas_categorical_as_column( + value + ) if _is_scalar_or_zero_d_array(value): dtype = None @@ -3147,7 +3272,9 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): elif isinstance(value, (pd.Series, Series)): value = Series(value, nan_as_null=nan_as_null) if not ignore_index: - value = value._align_to_index(self._index, how="right", sort=False) + value = value._align_to_index( + self._index, how="right", sort=False + ) value = column.as_column(value, nan_as_null=nan_as_null) @@ -3436,13 +3563,17 @@ def rename( with a number. """ if errors != "ignore": - raise NotImplementedError("Only errors='ignore' is currently supported") + raise NotImplementedError( + "Only errors='ignore' is currently supported" + ) if mapper is None and index is None and columns is None: return self.copy(deep=copy) index = mapper if index is None and axis in (0, "index") else index - columns = mapper if columns is None and axis in (1, "columns") else columns + columns = ( + mapper if columns is None and axis in (1, "columns") else columns + ) if index: if ( @@ -3496,13 +3627,17 @@ def rename( @_cudf_nvtx_annotate def add_prefix(self, prefix): out = self.copy(deep=True) - out.columns = [prefix + col_name for col_name in list(self._data.keys())] + out.columns = [ + prefix + col_name for col_name in list(self._data.keys()) + ] return out @_cudf_nvtx_annotate def add_suffix(self, suffix): out = self.copy(deep=True) - out.columns = [col_name + suffix for col_name in list(self._data.keys())] + out.columns = [ + col_name + suffix for col_name in list(self._data.keys()) + ] return out @_cudf_nvtx_annotate @@ -3551,7 +3686,9 @@ def agg(self, aggs, axis=None): if axis == 0 or axis is not None: raise NotImplementedError("axis not implemented yet") - if isinstance(aggs, abc.Iterable) and not isinstance(aggs, (str, dict)): + if isinstance(aggs, abc.Iterable) and not isinstance( + aggs, (str, dict) + ): result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization @@ -3562,7 +3699,8 @@ def agg(self, aggs, axis=None): elif isinstance(aggs, str): if not hasattr(self, aggs): raise AttributeError( - f"{aggs} is not a valid function for " f"'DataFrame' object" + f"{aggs} is not a valid function for " + f"'DataFrame' object" ) result = DataFrame() result[aggs] = getattr(self, aggs)() @@ -3573,14 +3711,17 @@ def agg(self, aggs, axis=None): elif isinstance(aggs, dict): cols = aggs.keys() if any(callable(val) for val in aggs.values()): - raise NotImplementedError("callable parameter is not implemented yet") + raise NotImplementedError( + "callable parameter is not implemented yet" + ) elif all(isinstance(val, str) for val in aggs.values()): res = {} for key, value in aggs.items(): col = self[key] if not hasattr(col, value): raise AttributeError( - f"{value} is not a valid function for " f"'Series' object" + f"{value} is not a valid function for " + f"'Series' object" ) res[key] = getattr(col, value)() result = cudf.Series(list(res.values()), index=res.keys()) @@ -3600,7 +3741,9 @@ def agg(self, aggs, axis=None): result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = self[key] - col_empty = column_empty(len(idxs), dtype=col.dtype, masked=True) + col_empty = column_empty( + len(idxs), dtype=col.dtype, masked=True + ) ans = cudf.Series(data=col_empty, index=idxs) if isinstance(aggs.get(key), abc.Iterable): # 
TODO : Allow simultaneous pass for multi-aggregation @@ -3626,7 +3769,9 @@ def agg(self, aggs, axis=None): return result elif callable(aggs): - raise NotImplementedError("callable parameter is not implemented yet") + raise NotImplementedError( + "callable parameter is not implemented yet" + ) else: raise ValueError("argument must be a string, list or dict") @@ -3861,7 +4006,8 @@ def transpose(self): source_dtype = source_columns[0].dtype if isinstance(source_dtype, cudf.CategoricalDtype): if any( - not isinstance(c.dtype, cudf.CategoricalDtype) for c in source_columns + not isinstance(c.dtype, cudf.CategoricalDtype) + for c in source_columns ): raise ValueError("Columns must all have the same dtype") cats = list(c.categories for c in source_columns) @@ -4042,14 +4188,17 @@ def merge( ordering. """ if indicator: - raise NotImplementedError("Only indicator=False is currently supported") + raise NotImplementedError( + "Only indicator=False is currently supported" + ) if lsuffix or rsuffix: raise ValueError( "The lsuffix and rsuffix keywords have been replaced with the " "``suffixes=`` keyword. " "Please provide the following instead: \n\n" - " suffixes=('%s', '%s')" % (lsuffix or "_x", rsuffix or "_y") + " suffixes=('%s', '%s')" + % (lsuffix or "_x", rsuffix or "_y") ) else: lsuffix, rsuffix = suffixes @@ -4124,7 +4273,9 @@ def join( suffixes=(lsuffix, rsuffix), sort=sort, ) - df.index.name = None if self.index.name != other.index.name else self.index.name + df.index.name = ( + None if self.index.name != other.index.name else self.index.name + ) return df @_cudf_nvtx_annotate @@ -4244,7 +4395,8 @@ def query(self, expr, local_dict=None): if not isinstance(local_dict, dict): raise TypeError( - f"local_dict type: expected dict but found " f"{type(local_dict)}" + f"local_dict type: expected dict but found " + f"{type(local_dict)}" ) # Get calling environment @@ -4256,10 +4408,14 @@ def query(self, expr, local_dict=None): } # Run query boolmask = queryutils.query_execute(self, expr, callenv) - return self._apply_boolean_mask(BooleanMask.from_column_unchecked(boolmask)) + return self._apply_boolean_mask( + BooleanMask.from_column_unchecked(boolmask) + ) @_cudf_nvtx_annotate - def apply(self, func, axis=1, raw=False, result_type=None, args=(), **kwargs): + def apply( + self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + ): """ Apply a function along an axis of the DataFrame. ``apply`` relies on Numba to JIT compile ``func``. @@ -4436,7 +4592,9 @@ def apply(self, func, axis=1, raw=False, result_type=None, args=(), **kwargs): """ if axis != 1: - raise ValueError("DataFrame.apply currently only supports row wise ops") + raise ValueError( + "DataFrame.apply currently only supports row wise ops" + ) if raw: raise ValueError("The `raw` kwarg is not yet supported.") if result_type is not None: @@ -4471,7 +4629,8 @@ def applymap( # Do not remove until pandas 3.0 support is added. assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." warnings.warn( - "DataFrame.applymap has been deprecated. Use DataFrame.map " "instead.", + "DataFrame.applymap has been deprecated. 
Use DataFrame.map " + "instead.", FutureWarning, ) return self.map(func=func, na_action=na_action, **kwargs) @@ -4709,7 +4868,9 @@ def partition_by_hash(self, columns, nparts, keep_index=True): else: cols = [*self._columns] - output_columns, offsets = libcudf.hash.hash_partition(cols, key_indices, nparts) + output_columns, offsets = libcudf.hash.hash_partition( + cols, key_indices, nparts + ) outdf = self._from_columns_like_self( output_columns, self._column_names, @@ -4873,7 +5034,9 @@ def info( entries_summary = f", {self._index[0]} to {self._index[-1]}" else: entries_summary = "" - index_summary = f"{index_name}: {len(self._index)} entries{entries_summary}" + index_summary = ( + f"{index_name}: {len(self._index)} entries{entries_summary}" + ) lines.append(index_summary) if len(self._data) == 0: @@ -4915,17 +5078,22 @@ def _verbose_repr(): space_num = max(max_id, len_id) + col_space counts = None - header = _put_str(id_head, space_num) + _put_str(column_head, space) + header = _put_str(id_head, space_num) + _put_str( + column_head, space + ) if show_counts: counts = self.count().to_pandas().tolist() if col_count != len(counts): raise AssertionError( - f"Columns must equal " f"counts ({col_count} != {len(counts)})" + f"Columns must equal " + f"counts ({col_count} != {len(counts)})" ) count_header = "Non-Null Count" len_count = len(count_header) non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + max_count = max(len(pprint_thing(k)) for k in counts) + len( + non_null + ) space_count = max(len_count, max_count) + col_space count_temp = "{count}" + non_null else: @@ -5013,7 +5181,9 @@ def _sizeof_fmt(num, size_qualifier): if "object" in dtype_counts or self.index.dtype == "object": size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + lines.append( + f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n" + ) cudf.utils.ioutils.buffer_write_lines(buf, lines) @@ -5039,7 +5209,9 @@ def describe( data_to_describe = self else: - data_to_describe = self.select_dtypes(include=include, exclude=exclude) + data_to_describe = self.select_dtypes( + include=include, exclude=exclude + ) if data_to_describe.empty: raise ValueError("No data of included types.") @@ -5053,9 +5225,15 @@ def describe( if len(describe_series_list) == 1: return describe_series_list[0].to_frame() else: - ldesc_indexes = sorted((x.index for x in describe_series_list), key=len) + ldesc_indexes = sorted( + (x.index for x in describe_series_list), key=len + ) names = dict.fromkeys( - [name for idxnames in ldesc_indexes for name in idxnames.to_pandas()], + [ + name + for idxnames in ldesc_indexes + for name in idxnames.to_pandas() + ], None, ) @@ -5151,7 +5329,9 @@ def to_pandas( """ out_index = self.index.to_pandas() out_data = { - i: col.to_pandas(index=out_index, nullable=nullable, arrow_type=arrow_type) + i: col.to_pandas( + index=out_index, nullable=nullable, arrow_type=arrow_type + ) for i, col in enumerate(self._data.columns) } @@ -5192,14 +5372,18 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): 2 3 4 """ if nan_as_null is no_default: - nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None + nan_as_null = ( + False if cudf.get_option("mode.pandas_compatible") else None + ) if isinstance(dataframe, pd.DataFrame): if not dataframe.columns.is_unique: raise ValueError("Duplicate column names are not allowed") data = { - col_name: 
column.as_column(col_value.array, nan_as_null=nan_as_null) + col_name: column.as_column( + col_value.array, nan_as_null=nan_as_null + ) for col_name, col_value in dataframe.items() } if isinstance(dataframe.index, pd.MultiIndex): @@ -5207,7 +5391,9 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): dataframe.index, nan_as_null=nan_as_null ) else: - index = cudf.Index.from_pandas(dataframe.index, nan_as_null=nan_as_null) + index = cudf.Index.from_pandas( + dataframe.index, nan_as_null=nan_as_null + ) df = cls._from_data(data, index) df._data._level_names = tuple(dataframe.columns.names) @@ -5223,7 +5409,9 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): # this isn't pandas specific return from_dataframe(dataframe, allow_copy=True) else: - raise TypeError(f"Could not construct DataFrame from {type(dataframe)}") + raise TypeError( + f"Could not construct DataFrame from {type(dataframe)}" + ) @classmethod @_cudf_nvtx_annotate @@ -5348,13 +5536,18 @@ def to_arrow(self, preserve_index=True): else: if isinstance(self.index, MultiIndex): gen_names = tuple( - f"level_{i}" for i, _ in enumerate(self.index._data.names) + f"level_{i}" + for i, _ in enumerate(self.index._data.names) ) else: gen_names = ( - self.index.names if self.index.name is not None else ("index",) + self.index.names + if self.index.name is not None + else ("index",) ) - for gen_name, col_name in zip(gen_names, self.index._data.names): + for gen_name, col_name in zip( + gen_names, self.index._data.names + ): data._insert( data.shape[1], gen_name, @@ -5419,7 +5612,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): DataFrame """ if data.ndim != 1 and data.ndim != 2: - raise ValueError(f"records dimension expected 1 or 2 but found {data.ndim}") + raise ValueError( + f"records dimension expected 1 or 2 but found {data.ndim}" + ) num_cols = len(data[0]) @@ -5432,7 +5627,8 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): else: if len(columns) != num_cols: raise ValueError( - f"columns length expected {num_cols} " f"but found {len(columns)}" + f"columns length expected {num_cols} " + f"but found {len(columns)}" ) names = columns @@ -5440,10 +5636,14 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): if data.ndim == 2: for i, k in enumerate(names): - df._data[k] = column.as_column(data[:, i], nan_as_null=nan_as_null) + df._data[k] = column.as_column( + data[:, i], nan_as_null=nan_as_null + ) elif data.ndim == 1: for k in names: - df._data[k] = column.as_column(data[k], nan_as_null=nan_as_null) + df._data[k] = column.as_column( + data[k], nan_as_null=nan_as_null + ) if index is None: df._index = RangeIndex(start=0, stop=len(data)) @@ -5495,7 +5695,8 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): else: if len(columns) != num_cols: raise ValueError( - f"columns length expected {num_cols} but " f"found {len(columns)}" + f"columns length expected {num_cols} but " + f"found {len(columns)}" ) elif len(columns) != len(set(columns)): raise ValueError("Duplicate column names are not allowed") @@ -5504,9 +5705,13 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df = cls() if data.ndim == 2: for i, k in enumerate(names): - df._data[k] = column.as_column(data[:, i], nan_as_null=nan_as_null) + df._data[k] = column.as_column( + data[:, i], nan_as_null=nan_as_null + ) elif data.ndim == 1: - df._data[names[0]] = column.as_column(data, nan_as_null=nan_as_null) + df._data[names[0]] = column.as_column( 
+ data, nan_as_null=nan_as_null + ) if isinstance(columns, pd.Index): df._data._level_names = tuple(columns.names) if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)): @@ -5659,7 +5864,9 @@ def quantile( if q_is_number: result = result.transpose() - return Series(data=result._columns[0], index=result.index, name=q) + return Series( + data=result._columns[0], index=result.index, name=q + ) else: # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" @@ -5765,7 +5972,10 @@ def make_false_column_like_self(): # precomputed variables inside the loop requires nontrivial logic. # This optimization could be attempted if `isin` ever becomes a # bottleneck. - if isinstance(values, (Series, DataFrame)) and not values.index.is_unique: + if ( + isinstance(values, (Series, DataFrame)) + and not values.index.is_unique + ): # if DataFrame ever supports duplicate columns # would need to check that here raise ValueError("cannot compute isin with a duplicate axis.") @@ -5918,7 +6128,12 @@ def count(self, axis=0, numeric_only=False): raise NotImplementedError("Only axis=0 is currently supported.") length = len(self) return Series._from_data( - {None: [length - self._data[col].null_count for col in self._data.names]}, + { + None: [ + length - self._data[col].null_count + for col in self._data.names + ] + }, as_index(self._data.names), ) @@ -5977,7 +6192,9 @@ def _reduce( # TODO: concat + op can probably be done in the general case # for axis == 2. # https://github.com/rapidsai/cudf/issues/14930 - return getattr(concat_columns(source._data.columns), op)(**kwargs) + return getattr(concat_columns(source._data.columns), op)( + **kwargs + ) try: result = [ getattr(source._data[col], op)(**kwargs) @@ -6021,13 +6238,17 @@ def _reduce( else: raise if axis == 2: - return getattr(as_column(result, nan_as_null=False), op)(**kwargs) + return getattr(as_column(result, nan_as_null=False), op)( + **kwargs + ) else: source_dtypes = [c.dtype for c in source._data.columns] common_dtype = find_common_type(source_dtypes) if ( is_object_dtype(common_dtype) - and any(not is_object_dtype(dtype) for dtype in source_dtypes) + and any( + not is_object_dtype(dtype) for dtype in source_dtypes + ) or not is_bool_dtype(common_dtype) and any(is_bool_dtype(dtype) for dtype in source_dtypes) ): @@ -6150,13 +6371,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True): else: data_df = self - mode_results = [data_df[col].mode(dropna=dropna) for col in data_df._data] + mode_results = [ + data_df[col].mode(dropna=dropna) for col in data_df._data + ] if len(mode_results) == 0: return DataFrame() with warnings.catch_warnings(): - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." 
warnings.simplefilter("ignore", FutureWarning) df = cudf.concat(mode_results, axis=1) @@ -6229,7 +6454,9 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared._data[col] = ( prepared._data[col] .astype( - cudf.utils.dtypes.get_min_float_dtype(prepared._data[col]) + cudf.utils.dtypes.get_min_float_dtype( + prepared._data[col] + ) if not is_datetime_dtype(common_dtype) else cudf.dtype("float64") ) @@ -6256,7 +6483,8 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): } result_dtype = ( common_dtype - if method in type_coerced_methods or is_datetime_dtype(common_dtype) + if method in type_coerced_methods + or is_datetime_dtype(common_dtype) else None ) result = column.as_column(result, dtype=result_dtype) @@ -6279,7 +6507,9 @@ def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. """ - return DataFrame({col: self._data[col] for col in columns}, index=self.index) + return DataFrame( + {col: self._data[col] for col in columns}, index=self.index + ) @_cudf_nvtx_annotate def select_dtypes(self, include=None, exclude=None): @@ -6359,7 +6589,9 @@ def select_dtypes(self, include=None, exclude=None): selection = tuple(map(frozenset, (include, exclude))) if not any(selection): - raise ValueError("at least one of include or exclude must be nonempty") + raise ValueError( + "at least one of include or exclude must be nonempty" + ) include, exclude = map( lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)), @@ -6368,7 +6600,9 @@ def select_dtypes(self, include=None, exclude=None): # can't both include AND exclude! if not include.isdisjoint(exclude): - raise ValueError(f"include and exclude overlap on {(include & exclude)}") + raise ValueError( + f"include and exclude overlap on {(include & exclude)}" + ) # include all subtypes include_subtypes = set() @@ -6670,16 +6904,18 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): if future_stack: if dropna is not no_default: raise ValueError( - "dropna must be unspecified with future_stack=True as the new " - "implementation does not introduce rows of NA values. This " - "argument will be removed in a future version of cudf." + "dropna must be unspecified with future_stack=True as " + "the new implementation does not introduce rows of NA " + "values. This argument will be removed in a future " + "version of cudf." ) else: if dropna is not no_default or self._data.nlevels > 1: warnings.warn( - "The previous implementation of stack is deprecated and will be " - "removed in a future version of cudf. Specify future_stack=True " - "to adopt the new implementation and silence this warning.", + "The previous implementation of stack is deprecated and " + "will be removed in a future version of cudf. Specify " + "future_stack=True to adopt the new implementation and " + "silence this warning.", FutureWarning, ) if dropna is no_default: @@ -6693,7 +6929,9 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): "level must be either an int/str, or a list of int/str." ) else: - raise ValueError("level must be either an int/str, or a list of int/str.") + raise ValueError( + "level must be either an int/str, or a list of int/str." 
+ ) level = [level] if not isinstance(level, list) else level @@ -6721,7 +6959,9 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): ) else: # Must be a list of positions, normalize negative positions - level_indices = [lv + self._data.nlevels if lv < 0 else lv for lv in level] + level_indices = [ + lv + self._data.nlevels if lv < 0 else lv for lv in level + ] unnamed_levels_indices = [ i for i in range(self._data.nlevels) if i not in level_indices @@ -6766,14 +7006,18 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` - column_idx_df = pd.DataFrame(data=range(len(self._data)), index=named_levels) + column_idx_df = pd.DataFrame( + data=range(len(self._data)), index=named_levels + ) column_indices: list[list[int]] = [] if has_unnamed_levels: unnamed_level_values = list( map(column_name_idx.get_level_values, unnamed_levels_indices) ) - unnamed_level_values = pd.MultiIndex.from_arrays(unnamed_level_values) + unnamed_level_values = pd.MultiIndex.from_arrays( + unnamed_level_values + ) def unnamed_group_generator(): if has_unnamed_levels: @@ -6790,7 +7034,9 @@ def unnamed_group_generator(): ).values else: yield ( - grpdf.reindex(unique_named_levels, axis=0, fill_value=-1) + grpdf.reindex( + unique_named_levels, axis=0, fill_value=-1 + ) .sort_index() .values ) @@ -6819,7 +7065,9 @@ def unnamed_group_generator(): ) all_nulls = functools.cache( - functools.partial(column_empty, self.shape[0], common_type, masked=True) + functools.partial( + column_empty, self.shape[0], common_type, masked=True + ) ) # homogenize the dtypes of the columns @@ -6832,7 +7080,9 @@ def unnamed_group_generator(): # Construct the resulting dataframe / series if not has_unnamed_levels: - result = Series._from_data(data={None: stacked[0]}, index=new_index) + result = Series._from_data( + data={None: stacked[0]}, index=new_index + ) else: if unnamed_level_values.nlevels == 1: unnamed_level_values = unnamed_level_values.get_level_values(0) @@ -6842,9 +7092,14 @@ def unnamed_group_generator(): dict( zip( unnamed_level_values, - [stacked[i] for i in unnamed_level_values.argsort().argsort()] + [ + stacked[i] + for i in unnamed_level_values.argsort().argsort() + ] if not future_stack - else [stacked[i] for i in unnamed_level_values.argsort()], + else [ + stacked[i] for i in unnamed_level_values.argsort() + ], ) ), isinstance(unnamed_level_values, pd.MultiIndex), @@ -7046,7 +7301,9 @@ def pivot_table( @_cudf_nvtx_annotate @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): - return cudf.core.reshape.unstack(self, level=level, fill_value=fill_value) + return cudf.core.reshape.unstack( + self, level=level, fill_value=fill_value + ) @_cudf_nvtx_annotate def explode(self, column, ignore_index=False): @@ -7143,7 +7400,9 @@ def pct_change( if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." 
warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " @@ -7161,9 +7420,13 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) - return data.diff(periods=periods) / data.shift(periods=periods, freq=freq) + return data.diff(periods=periods) / data.shift( + periods=periods, freq=freq + ) - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ): return df_protocol.__dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) @@ -7211,7 +7474,8 @@ def _sample_axis_1( # Since cuDF does not support multiple columns with same name, # sample with replace=True at axis 1 is unsupported. raise NotImplementedError( - "Sample is not supported for axis 1/`columns` when" "`replace=True`." + "Sample is not supported for axis 1/`columns` when" + "`replace=True`." ) sampled_column_labels = random_state.choice( @@ -7275,7 +7539,9 @@ def interleave_columns(self): The interleaved columns as a single column """ if ("category" == self.dtypes).any(): - raise ValueError("interleave_columns does not support 'category' dtype.") + raise ValueError( + "interleave_columns does not support 'category' dtype." + ) return self._constructor_sliced._from_data( {None: libcudf.reshape.interleave_columns([*self._columns])} @@ -7385,7 +7651,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): the output of earlier statements) is not supported. """ if kwargs: - raise ValueError("Keyword arguments other than `inplace` are not supported") + raise ValueError( + "Keyword arguments other than `inplace` are not supported" + ) # Have to use a regex match to avoid capturing ==, >=, or <= equals_sign_regex = "[^=><]=[^=]" @@ -7403,7 +7671,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): if not includes_assignment: if inplace: - raise ValueError("Cannot operate inplace if there is no assignment") + raise ValueError( + "Cannot operate inplace if there is no assignment" + ) return Series._from_data( { None: libcudf.transform.compute_column( @@ -7427,7 +7697,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): exprs.append(e.strip()) cols = ( - libcudf.transform.compute_column([*self._columns], self._column_names, e) + libcudf.transform.compute_column( + [*self._columns], self._column_names, e + ) for e in exprs ) ret = self if inplace else self.copy(deep=False) @@ -7592,7 +7864,9 @@ def func(left, right, output): if isinstance(right, Series): uncommon_columns = set(left._column_names) ^ set(right.index) elif isinstance(right, DataFrame): - uncommon_columns = set(left._column_names) ^ set(right._column_names) + uncommon_columns = set(left._column_names) ^ set( + right._column_names + ) elif _is_scalar_or_zero_d_array(right): for name, col in output._data.items(): output._data[name] = col.fillna(value) @@ -7601,7 +7875,9 @@ def func(left, right, output): return output for name in uncommon_columns: - output._data[name] = as_column(value, length=len(output), dtype="bool") + output._data[name] = as_column( + value, length=len(output), dtype="bool" + ) return output return func @@ -7622,7 +7898,9 @@ def func(left, right, output): "gt", "ge", ]: - setattr(DataFrame, binop, make_binop_func(binop, _make_replacement_func(False))) + setattr( + DataFrame, binop, make_binop_func(binop, _make_replacement_func(False)) + ) @_cudf_nvtx_annotate @@ -7725,7 +8003,9 @@ def from_pandas(obj, 
nan_as_null=no_default): """ if nan_as_null is no_default: - nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None + nan_as_null = ( + False if cudf.get_option("mode.pandas_compatible") else None + ) if isinstance(obj, pd.DataFrame): return DataFrame.from_pandas(obj, nan_as_null=nan_as_null) @@ -7825,7 +8105,9 @@ def _setitem_with_dataframe( input_cols = input_df._column_names if len(input_cols) != len(replace_df._column_names): - raise ValueError("Number of Input Columns must be same replacement Dataframe") + raise ValueError( + "Number of Input Columns must be same replacement Dataframe" + ) if ( not ignore_index @@ -7837,7 +8119,9 @@ def _setitem_with_dataframe( for col_1, col_2 in zip(input_cols, replace_df._column_names): if col_1 in input_df._column_names: if mask is not None: - input_df._data[col_1][mask] = column.as_column(replace_df[col_2]) + input_df._data[col_1][mask] = column.as_column( + replace_df[col_2] + ) else: input_df._data[col_1] = column.as_column(replace_df[col_2]) else: @@ -7932,7 +8216,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): if all(is_numeric_dtype(col.dtype) for col in cols): dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories - elif all(isinstance(col, cudf.core.column.CategoricalColumn) for col in cols): + elif all( + isinstance(col, cudf.core.column.CategoricalColumn) for col in cols + ): # Combine and de-dupe the categories categories[idx] = cudf.Series( concat_columns([col.categories for col in cols]) @@ -7977,7 +8263,9 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=build_column(cols[name].base_data, dtype=cols[name].dtype), + codes=build_column( + cols[name].base_data, dtype=cols[name].dtype + ), mask=cols[name].base_mask, offset=cols[name].offset, size=cols[name].size, diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index e0ea964c767..62ded8ac6f1 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -247,7 +247,9 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: kind = _DtypeKind.CATEGORICAL # Codes and categories' dtypes are different. # We use codes' dtype as these are stored in the buffer. - codes = cast(cudf.core.column.CategoricalColumn, self._col).codes + codes = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes dtype = codes.dtype else: raise ValueError( @@ -324,7 +326,9 @@ def describe_null(self) -> Tuple[int, Any]: return _MaskKind.BITMASK, 0 else: - raise NotImplementedError(f"Data type {self.dtype} not yet supported") + raise NotImplementedError( + f"Data type {self.dtype} not yet supported" + ) @property def null_count(self) -> int: @@ -346,7 +350,9 @@ def num_chunks(self) -> int: """ return 1 - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["_CuDFColumn"]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFColumn"]: """ Return an iterable yielding the chunks. 
@@ -405,16 +411,21 @@ def _get_validity_buffer( if null == _MaskKind.BITMASK: assert self._col.mask is not None - buffer = _CuDFBuffer(self._col.mask, cp.uint8, allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + self._col.mask, cp.uint8, allow_copy=self._allow_copy + ) dtype = (_DtypeKind.UINT, 8, "C", "=") return buffer, dtype elif null == _MaskKind.NAN: raise RuntimeError( - "This column uses NaN as null " "so does not have a separate mask" + "This column uses NaN as null " + "so does not have a separate mask" ) elif null == _MaskKind.NON_NULLABLE: - raise RuntimeError("This column is non-nullable so does not have a mask") + raise RuntimeError( + "This column is non-nullable so does not have a mask" + ) else: raise NotImplementedError( f"See {self.__class__.__name__}.describe_null method." @@ -465,18 +476,26 @@ def _get_data_buffer( dtype = self.dtype elif self.dtype[0] == _DtypeKind.CATEGORICAL: - col_data = cast(cudf.core.column.CategoricalColumn, self._col).codes + col_data = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes dtype = self._dtype_from_cudfdtype(col_data.dtype) elif self.dtype[0] == _DtypeKind.STRING: - col_data = build_column(data=self._col.data, dtype=np.dtype("int8")) + col_data = build_column( + data=self._col.data, dtype=np.dtype("int8") + ) dtype = self._dtype_from_cudfdtype(col_data.dtype) else: - raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + raise NotImplementedError( + f"Data type {self._col.dtype} not handled yet" + ) assert (col_data is not None) and (col_data.data is not None), " " f"col_data(.data) should not be None when dtype = {dtype}" - buffer = _CuDFBuffer(col_data.data, col_data.dtype, allow_copy=self._allow_copy) + buffer = _CuDFBuffer( + col_data.data, col_data.dtype, allow_copy=self._allow_copy + ) return buffer, dtype @@ -516,7 +535,9 @@ def __dataframe__( """ See the docstring of the `cudf.DataFrame.__dataframe__` for details """ - return _CuDFDataFrame(self._df, nan_as_null=nan_as_null, allow_copy=allow_copy) + return _CuDFDataFrame( + self._df, nan_as_null=nan_as_null, allow_copy=allow_copy + ) @property def metadata(self): @@ -537,10 +558,14 @@ def column_names(self) -> Iterable[str]: return self._df._column_names def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn(as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy) + return _CuDFColumn( + as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy + ) def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy) + return _CuDFColumn( + as_column(self._df[name]), allow_copy=self._allow_copy + ) def get_columns(self) -> Iterable[_CuDFColumn]: return [ @@ -562,7 +587,9 @@ def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": self._df.loc[:, names], self._nan_as_null, self._allow_copy ) - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["_CuDFDataFrame"]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFDataFrame"]: """ Return an iterator yielding the chunks. """ @@ -627,7 +654,9 @@ def __dataframe__( } -def from_dataframe(df: DataFrameObject, allow_copy: bool = False) -> _CuDFDataFrame: +def from_dataframe( + df: DataFrameObject, allow_copy: bool = False +) -> _CuDFDataFrame: """ Construct a ``DataFrame`` from ``df`` if it supports the dataframe interchange protocol (``__dataframe__``). 
@@ -688,16 +717,24 @@ def from_dataframe(df: DataFrameObject, allow_copy: bool = False) -> _CuDFDataFr _DtypeKind.FLOAT, _DtypeKind.BOOL, ): - columns[name], _buf = _protocol_to_cudf_column_numeric(col, allow_copy) + columns[name], _buf = _protocol_to_cudf_column_numeric( + col, allow_copy + ) elif col.dtype[0] == _DtypeKind.CATEGORICAL: - columns[name], _buf = _protocol_to_cudf_column_categorical(col, allow_copy) + columns[name], _buf = _protocol_to_cudf_column_categorical( + col, allow_copy + ) elif col.dtype[0] == _DtypeKind.STRING: - columns[name], _buf = _protocol_to_cudf_column_string(col, allow_copy) + columns[name], _buf = _protocol_to_cudf_column_string( + col, allow_copy + ) else: - raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") + raise NotImplementedError( + f"Data type {col.dtype[0]} not handled yet" + ) _buffers.append(_buf) @@ -759,12 +796,16 @@ def _set_missing_values( if valid_mask is not None: null, invalid = protocol_col.describe_null if null == _MaskKind.BYTEMASK: - valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) + valid_mask = _ensure_gpu_buffer( + valid_mask[0], valid_mask[1], allow_copy + ) boolmask = as_column(valid_mask._buf, dtype="bool") bitmask = cudf._lib.transform.bools_to_mask(boolmask) return cudf_col.set_mask(bitmask) elif null == _MaskKind.BITMASK: - valid_mask = _ensure_gpu_buffer(valid_mask[0], valid_mask[1], allow_copy) + valid_mask = _ensure_gpu_buffer( + valid_mask[0], valid_mask[1], allow_copy + ) bitmask = valid_mask._buf return cudf_col.set_mask(bitmask) return cudf_col @@ -790,7 +831,9 @@ def _protocol_to_cudf_column_categorical( """ ordered, is_dict, categories = col.describe_categorical if not is_dict: - raise NotImplementedError("Non-dictionary categoricals not supported yet") + raise NotImplementedError( + "Non-dictionary categoricals not supported yet" + ) buffers = col.get_buffers() assert buffers["data"] is not None, "data buffer should not be None" codes_buffer, codes_dtype = buffers["data"] @@ -851,6 +894,8 @@ def _protocol_to_cudf_column_string( def _protocol_buffer_to_cudf_buffer(protocol_buffer): return as_buffer( - rmm.DeviceBuffer(ptr=protocol_buffer.ptr, size=protocol_buffer.bufsize), + rmm.DeviceBuffer( + ptr=protocol_buffer.ptr, size=protocol_buffer.bufsize + ), exposed=True, ) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4974329396d..3bd342e24c2 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -225,7 +225,9 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> cudf_dtype CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) """ # noqa: E501 - return CategoricalDtype(categories=dtype.categories, ordered=dtype.ordered) + return CategoricalDtype( + categories=dtype.categories, ordered=dtype.ordered + ) def to_pandas(self) -> pd.CategoricalDtype: """ @@ -307,7 +309,9 @@ def deserialize(cls, header, frames): categories_header = header["categories"] categories_frames = frames categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize(categories_header, categories_frames) + categories = categories_type.deserialize( + categories_header, categories_frames + ) return klass(categories=categories, ordered=ordered) def __repr__(self): @@ -354,7 +358,9 @@ def __init__(self, element_type: Any) -> None: if isinstance(element_type, ListDtype): self._typ = pa.list_(element_type._typ) else: - element_type = 
cudf.utils.dtypes.cudf_dtype_to_pa_type(element_type) + element_type = cudf.utils.dtypes.cudf_dtype_to_pa_type( + element_type + ) self._typ = pa.list_(element_type) @cached_property @@ -541,7 +547,8 @@ class StructDtype(_BaseDtype): def __init__(self, fields): pa_fields = { - k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) for k, v in fields.items() + k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) + for k, v in fields.items() } self._typ = pa.struct(pa_fields) @@ -646,7 +653,9 @@ def deserialize(cls, header: dict, frames: list): for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads(dtype_header["type-serialized"]).deserialize( + fields[k] = pickle.loads( + dtype_header["type-serialized"] + ).deserialize( dtype_header, frames[start:stop], ) @@ -910,7 +919,9 @@ def from_arrow(cls, typ): return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) def to_arrow(self): - return ArrowIntervalType(pa.from_numpy_dtype(self.subtype), self.closed) + return ArrowIntervalType( + pa.from_numpy_dtype(self.subtype), self.closed + ) @classmethod def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": @@ -1048,7 +1059,10 @@ def is_list_dtype(obj): or type(obj) is cudf.core.column.ListColumn or obj is cudf.core.column.ListColumn or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) - or (hasattr(obj, "dtype") and isinstance(obj.dtype, cudf.core.dtypes.ListDtype)) + or ( + hasattr(obj, "dtype") + and isinstance(obj.dtype, cudf.core.dtypes.ListDtype) + ) ) @@ -1095,7 +1109,9 @@ def is_decimal_dtype(obj): Whether or not the array-like or dtype is of the decimal dtype. """ return ( - is_decimal32_dtype(obj) or is_decimal64_dtype(obj) or is_decimal128_dtype(obj) + is_decimal32_dtype(obj) + or is_decimal64_dtype(obj) + or is_decimal128_dtype(obj) ) @@ -1110,7 +1126,9 @@ def _is_interval_dtype(obj): ) or obj is cudf.core.dtypes.IntervalDtype or (isinstance(obj, cudf.core.index.BaseIndex) and obj._is_interval()) - or (isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name) + or ( + isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name + ) or ( isinstance( getattr(obj, "dtype", None), @@ -1145,7 +1163,10 @@ def is_decimal32_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal32Dtype or obj is cudf.core.dtypes.Decimal32Dtype - or (isinstance(obj, str) and obj == cudf.core.dtypes.Decimal32Dtype.name) + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal32Dtype.name + ) or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) ) @@ -1154,7 +1175,10 @@ def is_decimal64_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal64Dtype or obj is cudf.core.dtypes.Decimal64Dtype - or (isinstance(obj, str) and obj == cudf.core.dtypes.Decimal64Dtype.name) + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal64Dtype.name + ) or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) ) @@ -1163,6 +1187,9 @@ def is_decimal128_dtype(obj): return ( type(obj) is cudf.core.dtypes.Decimal128Dtype or obj is cudf.core.dtypes.Decimal128Dtype - or (isinstance(obj, str) and obj == cudf.core.dtypes.Decimal128Dtype.name) + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal128Dtype.name + ) or (hasattr(obj, "dtype") and is_decimal128_dtype(obj.dtype)) ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2294bc40e15..809bdb4e6d1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -87,7 +87,9 
@@ def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? @property def _dtypes(self): - return dict(zip(self._data.names, (col.dtype for col in self._data.columns))) + return dict( + zip(self._data.names, (col.dtype for col in self._data.columns)) + ) @_cudf_nvtx_annotate def serialize(self): @@ -119,7 +121,9 @@ def deserialize(cls, header, frames): key = f"column_{metadata}" if key in header: kwargs[metadata] = pickle.loads(header[key]) - col_accessor = ColumnAccessor(data=dict(zip(column_names, columns)), **kwargs) + col_accessor = ColumnAccessor( + data=dict(zip(column_names, columns)), **kwargs + ) return cls_deserialize._from_data(col_accessor) @classmethod @@ -152,11 +156,15 @@ def _from_columns_like_self( return frame._copy_type_metadata(self, override_dtypes=override_dtypes) @_cudf_nvtx_annotate - def _mimic_inplace(self, result: Self, inplace: bool = False) -> Optional[Self]: + def _mimic_inplace( + self, result: Self, inplace: bool = False + ) -> Optional[Self]: if inplace: for col in self._data: if col in result._data: - self._data[col]._mimic_inplace(result._data[col], inplace=True) + self._data[col]._mimic_inplace( + result._data[col], inplace=True + ) self._data = result._data return None else: @@ -345,7 +353,9 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip(self._data.values(), other._data.values()) + for self_col, other_col in zip( + self._data.values(), other._data.values() + ) ) @_cudf_nvtx_annotate @@ -443,7 +453,9 @@ def get_column_values_na(col): ) dtype = find_common_type(dtypes) - matrix = make_empty_matrix(shape=(len(self), ncol), dtype=dtype, order="F") + matrix = make_empty_matrix( + shape=(len(self), ncol), dtype=dtype, order="F" + ) for i, col in enumerate(self._data.values()): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more @@ -484,7 +496,9 @@ def to_cupy( cupy.ndarray """ return self._to_array( - (lambda col: col.values.copy()) if copy else (lambda col: col.values), + (lambda col: col.values.copy()) + if copy + else (lambda col: col.values), cupy.empty, dtype, na_value, @@ -521,7 +535,9 @@ def to_numpy( "array always copies the data." 
) - return self._to_array((lambda col: col.values_host), np.empty, dtype, na_value) + return self._to_array( + (lambda col: col.values_host), np.empty, dtype, na_value + ) @_cudf_nvtx_annotate def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: @@ -712,7 +728,9 @@ def fillna( if method: if method not in {"ffill", "bfill", "pad", "backfill"}: - raise NotImplementedError(f"Fill method {method} is not supported") + raise NotImplementedError( + f"Fill method {method} is not supported" + ) if method == "pad": method = "ffill" elif method == "backfill": @@ -791,7 +809,9 @@ def _quantile_table( column_order = [libcudf.types.Order[key] for key in column_order] - null_precedence = [libcudf.types.NullOrder[key] for key in null_precedence] + null_precedence = [ + libcudf.types.NullOrder[key] for key in null_precedence + ] return self._from_columns_like_self( libcudf.quantiles.quantile_table( @@ -892,13 +912,17 @@ def from_arrow(cls, data: pa.Table) -> Self: size=codes.size, ordered=dict_ordered[name], ) - for name, codes in zip(dict_indices_table.column_names, indices_columns) + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ) } # Handle non-dict arrays cudf_non_category_frame = { name: col - for name, col in zip(data.column_names, libcudf.interop.from_arrow(data)) + for name, col in zip( + data.column_names, libcudf.interop.from_arrow(data) + ) } result = {**cudf_non_category_frame, **cudf_category_frame} @@ -906,13 +930,19 @@ def from_arrow(cls, data: pa.Table) -> Self: # There are some special cases that need to be handled # based on metadata. for name in result: - if len(result[name]) == 0 and pandas_dtypes.get(name) == "categorical": + if ( + len(result[name]) == 0 + and pandas_dtypes.get(name) == "categorical" + ): # When pandas_dtype is a categorical column and the size # of column is 0 (i.e., empty) then we will have an # int8 column in result._data[name] returned by libcudf, # which needs to be type-casted to 'category' dtype. result[name] = result[name].as_categorical_column("category") - elif pandas_dtypes.get(name) == "empty" and np_dtypes.get(name) == "object": + elif ( + pandas_dtypes.get(name) == "empty" + and np_dtypes.get(name) == "object" + ): # When a string column has all null values, pandas_dtype is # is specified as 'empty' and np_dtypes as 'object', # hence handling this special case to type-cast the empty @@ -981,7 +1011,9 @@ def _positions_from_column_names(self, column_names) -> list[int]: Frame. """ return [ - i for i, name in enumerate(self._column_names) if name in set(column_names) + i + for i, name in enumerate(self._column_names) + if name in set(column_names) ] @_cudf_nvtx_annotate @@ -1258,11 +1290,15 @@ def searchsorted( for col, val in zip(self._columns, values) ] sources = [ - col if is_dtype_equal(col.dtype, common_dtype) else col.astype(common_dtype) + col + if is_dtype_equal(col.dtype, common_dtype) + else col.astype(common_dtype) for col, common_dtype in zip(self._columns, common_dtype_list) ] values = [ - val if is_dtype_equal(val.dtype, common_dtype) else val.astype(common_dtype) + val + if is_dtype_equal(val.dtype, common_dtype) + else val.astype(common_dtype) for val, common_dtype in zip(values, common_dtype_list) ] @@ -1428,12 +1464,16 @@ def _is_sorted(self, ascending=None, null_position=None): Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. 
""" - if ascending is not None and not cudf.api.types.is_list_like(ascending): + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): raise TypeError( f"Expected a list-like or None for `ascending`, got " f"{type(ascending)}" ) - if null_position is not None and not cudf.api.types.is_list_like(null_position): + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): raise TypeError( f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" @@ -1449,7 +1489,9 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[split_idx], + libcudf.copying.columns_split([*self._data.columns], splits)[ + split_idx + ], self._column_names, ) for split_idx in range(len(splits) + 1) @@ -1552,7 +1594,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @_cudf_nvtx_annotate @acquire_spill_lock() - def _apply_cupy_ufunc_to_operands(self, ufunc, cupy_func, operands, **kwargs): + def _apply_cupy_ufunc_to_operands( + self, ufunc, cupy_func, operands, **kwargs + ): # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1865,7 +1909,10 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - {name: _apply_inverse_column(col) for name, col in self._data.items()} + { + name: _apply_inverse_column(col) + for name, col in self._data.items() + } ) @_cudf_nvtx_annotate @@ -1885,14 +1932,19 @@ def nunique(self, dropna: bool = True): Name and unique value counts of each column in frame. """ return { - name: col.distinct_count(dropna=dropna) for name, col in self._data.items() + name: col.distinct_count(dropna=dropna) + for name, col in self._data.items() } @staticmethod @_cudf_nvtx_annotate - def _repeat(columns: List[ColumnBase], repeats, axis=None) -> List[ColumnBase]: + def _repeat( + columns: List[ColumnBase], repeats, axis=None + ) -> List[ColumnBase]: if axis is not None: - raise NotImplementedError("Only axis=`None` supported at this time.") + raise NotImplementedError( + "Only axis=`None` supported at this time." 
+ ) if not is_scalar(repeats): repeats = as_column(repeats) @@ -1918,4 +1970,6 @@ def _apply_inverse_column(col: ColumnBase) -> ColumnBase: elif is_bool_dtype(col.dtype): return col.unary_operator("not") else: - raise TypeError(f"Operation `~` not supported on {col.dtype.type.__name__}") + raise TypeError( + f"Operation `~` not supported on {col.dtype.type.__name__}" + ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 375b6320297..d995964057b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -283,7 +283,9 @@ def __iter__(self): group_names = group_names.to_pandas() for i, name in enumerate(group_names): yield ( - (name,) if isinstance(self._by, list) and len(self._by) == 1 else name, + (name,) + if isinstance(self._by, list) and len(self._by) == 1 + else name, grouped_values[offsets[i] : offsets[i + 1]], ) @@ -321,7 +323,10 @@ def dtypes(self): index = self.grouping.keys.unique().sort_values().to_pandas() obj_dtypes = self.obj._dtypes return pd.DataFrame( - {name: [obj_dtypes[name]] * len(index) for name in self.obj._data.names}, + { + name: [obj_dtypes[name]] * len(index) + for name in self.obj._data.names + }, index=index, ) @@ -339,7 +344,9 @@ def groups(self): f"number of groups. Got {len(group_names)} groups." ) - return dict(zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1]))) + return dict( + zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) + ) @cached_property def indices(self): @@ -360,7 +367,11 @@ def indices(self): {10: array([0, 1]), 40: array([2])} """ offsets, group_keys, (indices,) = self._groupby.groups( - [cudf.core.column.as_column(range(len(self.obj)), dtype=size_type_dtype)] + [ + cudf.core.column.as_column( + range(len(self.obj)), dtype=size_type_dtype + ) + ] ) group_keys = libcudf.stream_compaction.drop_duplicates(group_keys) @@ -369,7 +380,9 @@ def indices(self): else: (group_keys,) = group_keys index = cudf.Index(group_keys) - return dict(zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))) + return dict( + zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) + ) @_cudf_nvtx_annotate def get_group(self, name, obj=None): @@ -422,7 +435,9 @@ def size(self): """ return ( cudf.Series( - cudf.core.column.column_empty(len(self.obj), "int8", masked=False) + cudf.core.column.column_empty( + len(self.obj), "int8", masked=False + ) ) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") @@ -435,7 +450,9 @@ def cumcount(self): """ return ( cudf.Series( - cudf.core.column.column_empty(len(self.obj), "int8", masked=False), + cudf.core.column.column_empty( + len(self.obj), "int8", masked=False + ), index=self.obj.index, ) .groupby(self.grouping, sort=self._sort) @@ -468,9 +485,12 @@ def rank( # treats NaNs the way we treat nulls. if cudf.get_option("mode.pandas_compatible"): if any( - is_float_dtype(typ) for typ in self.grouping.values._dtypes.values() + is_float_dtype(typ) + for typ in self.grouping.values._dtypes.values() ): - raise NotImplementedError("NaNs are not supported in groupby.rank.") + raise NotImplementedError( + "NaNs are not supported in groupby.rank." 
+ ) def rank(x): return getattr(x, "rank")( @@ -490,7 +510,9 @@ def rank(x): @cached_property def _groupby(self): - return libgroupby.GroupBy([*self.grouping.keys._columns], dropna=self._dropna) + return libgroupby.GroupBy( + [*self.grouping.keys._columns], dropna=self._dropna + ) @_cudf_nvtx_annotate def agg(self, func): @@ -613,7 +635,10 @@ def agg(self, func): key = (col_name, agg_name) else: key = col_name - if agg in {list, "collect"} and orig_dtype != col.dtype.element_type: + if ( + agg in {list, "collect"} + and orig_dtype != col.dtype.element_type + ): # Structs lose their labels which we reconstruct here col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) @@ -651,7 +676,9 @@ def agg(self, func): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list(self.grouping.keys.drop_duplicates()._data.columns) + left_cols = list( + self.grouping.keys.drop_duplicates()._data.columns + ) right_cols = list(result_index._data.columns) join_keys = [ _match_join_keys(lcol, rcol, "left") @@ -705,9 +732,13 @@ def _reduce( The numeric_only, min_count """ if numeric_only: - raise NotImplementedError("numeric_only parameter is not implemented yet") + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) if min_count != 0: - raise NotImplementedError("min_count parameter is not implemented yet") + raise NotImplementedError( + "min_count parameter is not implemented yet" + ) return self.agg(op) def _scan(self, op: str, *args, **kwargs): @@ -750,7 +781,9 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) if n < 0: - size_per_group = np.maximum(size_per_group + n, 0, out=size_per_group) + size_per_group = np.maximum( + size_per_group + n, 0, out=size_per_group + ) else: size_per_group = np.minimum(size_per_group, n, out=size_per_group) if take_head: @@ -827,7 +860,9 @@ def head(self, n: int = 5, *, preserve_order: bool = True): 6 3 6 8 3 8 """ - return self._head_tail(n, take_head=True, preserve_order=preserve_order) + return self._head_tail( + n, take_head=True, preserve_order=preserve_order + ) @_cudf_nvtx_annotate def tail(self, n: int = 5, *, preserve_order: bool = True): @@ -879,7 +914,9 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): 9 3 9 10 3 10 """ - return self._head_tail(n, take_head=False, preserve_order=preserve_order) + return self._head_tail( + n, take_head=False, preserve_order=preserve_order + ) @_cudf_nvtx_annotate def nth(self, n): @@ -895,7 +932,9 @@ def nth(self, n): result = result[sizes > n] - result._index = self.obj.index.take(result._data["__groupbynth_order__"]) + result._index = self.obj.index.take( + result._data["__groupbynth_order__"] + ) del result._data["__groupbynth_order__"] del self.obj._data["__groupbynth_order__"] return result @@ -1046,7 +1085,8 @@ def sample( # TODO: handle random states properly. 
if random_state is not None and not isinstance(random_state, int): raise NotImplementedError( - "Only integer seeds are supported for random_state " "in this case" + "Only integer seeds are supported for random_state " + "in this case" ) # Get the groups # TODO: convince Cython to convert the std::vector offsets @@ -1068,9 +1108,9 @@ def sample( # Pandas uses round-to-nearest, ties to even to # pick sample sizes for the fractional case (unlike IEEE # which is round-to-nearest, ties to sgn(x) * inf). - samples_per_group = np.round(size_per_group * frac, decimals=0).astype( - size_type_dtype - ) + samples_per_group = np.round( + size_per_group * frac, decimals=0 + ).astype(size_type_dtype) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1141,7 +1181,9 @@ def deserialize(cls, header, frames): kwargs = header["kwargs"] obj_type = pickle.loads(header["obj_type"]) - obj = obj_type.deserialize(header["obj"], frames[: header["num_obj_frames"]]) + obj = obj_type.deserialize( + header["obj"], frames[: header["num_obj_frames"]] + ) grouping = _Grouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) @@ -1266,7 +1308,9 @@ def pipe(self, func, *args, **kwargs): def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): - chunk_results = jit_groupby_apply(offsets, grouped_values, function, *args) + chunk_results = jit_groupby_apply( + offsets, grouped_values, function, *args + ) return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values ) @@ -1284,7 +1328,9 @@ def _iterative_groupby_apply( RuntimeWarning, ) - chunks = [grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:])] + chunks = [ + grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) + ] chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values @@ -1330,7 +1376,9 @@ def _post_process_chunk_results( # TODO: Is there a better way to determine what # the column name should be, especially if we applied # a nameless UDF. - result = result.to_frame(name=grouped_values._data.names[0]) + result = result.to_frame( + name=grouped_values._data.names[0] + ) else: index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column @@ -1356,7 +1404,9 @@ def _post_process_chunk_results( return result @_cudf_nvtx_annotate - def apply(self, function, *args, engine="auto", include_groups: bool = True): + def apply( + self, function, *args, engine="auto", include_groups: bool = True + ): """Apply a python transformation function over the grouped chunk. 
Parameters @@ -1870,9 +1920,13 @@ def corr(self, method="pearson", min_periods=1): """ if method.lower() not in ("pearson",): - raise NotImplementedError("Only pearson correlation is currently supported") + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) - return self._cov_or_corr(lambda x: x.corr(method, min_periods), "Correlation") + return self._cov_or_corr( + lambda x: x.corr(method, min_periods), "Correlation" + ) @_cudf_nvtx_annotate def cov(self, min_periods=0, ddof=1): @@ -1960,7 +2014,9 @@ def cov(self, min_periods=0, ddof=1): val3 3.833333 12.333333 12.333333 """ - return self._cov_or_corr(lambda x: x.cov(min_periods, ddof), "Covariance") + return self._cov_or_corr( + lambda x: x.cov(min_periods, ddof), "Covariance" + ) def _cov_or_corr(self, func, method_name): """ @@ -2004,15 +2060,17 @@ def _cov_or_corr(self, func, method_name): offset=0, ) - column_pair_groupby = cudf.DataFrame._from_data(column_pair_structs).groupby( - by=self.grouping.keys - ) + column_pair_groupby = cudf.DataFrame._from_data( + column_pair_structs + ).groupby(by=self.grouping.keys) try: gb_cov_corr = column_pair_groupby.agg(func) except RuntimeError as e: if "Unsupported groupby reduction type-agg combination" in str(e): - raise TypeError(f"{method_name} accepts only numerical column-pairs") + raise TypeError( + f"{method_name} accepts only numerical column-pairs" + ) raise # ensure that column-pair labels are arranged in ascending order @@ -2022,7 +2080,8 @@ def _cov_or_corr(self, func, method_name): for i, x in enumerate(column_names) ] cols_split = [ - cols_list[i : i + num_cols] for i in range(0, len(cols_list), num_cols) + cols_list[i : i + num_cols] + for i in range(0, len(cols_list), num_cols) ] # interleave: combines the correlation or covariance results for each @@ -2213,8 +2272,8 @@ def fillna( """ warnings.warn( "groupby fillna is deprecated and " - "will be removed in a future version. Use groupby ffill or groupby bfill " - "for forward or backward filling instead.", + "will be removed in a future version. Use groupby ffill " + "or groupby bfill for forward or backward filling instead.", FutureWarning, ) if inplace: @@ -2239,7 +2298,9 @@ def fillna( values = self.obj.__class__._from_data( self.grouping.values._data, self.obj.index ) - return values.fillna(value=value, inplace=inplace, axis=axis, limit=limit) + return values.fillna( + value=value, inplace=inplace, axis=axis, limit=limit + ) @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): @@ -2285,7 +2346,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): values = self.grouping.values if is_list_like(fill_value): if len(fill_value) != len(values._data): - raise ValueError("Mismatched number of columns and values to fill.") + raise ValueError( + "Mismatched number of columns and values to fill." + ) else: fill_value = [fill_value] * len(values._data) @@ -2293,7 +2356,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): dict( zip( values._column_names, - self._groupby.shift([*values._columns], periods, fill_value)[0], + self._groupby.shift( + [*values._columns], periods, fill_value + )[0], ) ) ) @@ -2345,11 +2410,15 @@ def pct_change( if freq is not None: raise NotImplementedError("freq parameter not supported yet.") elif fill_method not in {no_default, None, "ffill", "bfill"}: - raise ValueError("fill_method must be one of 'ffill', or" "'bfill'.") + raise ValueError( + "fill_method must be one of 'ffill', or" "'bfill'." 
+ ) if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The 'fill_method' keyword being not None and the 'limit' " f"keywords in {type(self).__name__}.pct_change are " @@ -2506,7 +2575,9 @@ def value_counts( df["__placeholder"] = 1 result = ( - df.groupby(groupings + list(subset), dropna=dropna)["__placeholder"] + df.groupby(groupings + list(subset), dropna=dropna)[ + "__placeholder" + ] .count() .sort_index() .astype(np.int64) @@ -2525,7 +2596,9 @@ def value_counts( if not self._as_index: if name in df._column_names: - raise ValueError(f"Column label '{name}' is duplicate of result column") + raise ValueError( + f"Column label '{name}' is duplicate of result column" + ) result.name = name result = result.to_frame().reset_index() else: @@ -2533,7 +2606,9 @@ def value_counts( return result - def _mimic_pandas_order(self, result: DataFrameOrSeries) -> DataFrameOrSeries: + def _mimic_pandas_order( + self, result: DataFrameOrSeries + ) -> DataFrameOrSeries: """Given a groupby result from libcudf, reconstruct the row orders matching that of pandas. This also adds appropriate indices. """ @@ -2546,9 +2621,12 @@ def _mimic_pandas_order(self, result: DataFrameOrSeries) -> DataFrameOrSeries: # result coming back from libcudf has null_count few rows than # the input, so we must produce an ordering from the full # input range. - _, _, (ordering,) = self._groupby.groups([as_column(range(0, len(self.obj)))]) + _, _, (ordering,) = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) if self._dropna and any( - c.has_nulls(include_nan=True) > 0 for c in self.grouping._key_columns + c.has_nulls(include_nan=True) > 0 + for c in self.grouping._key_columns ): # Scan aggregations with null/nan keys put nulls in the # corresponding output rows in pandas, to do that here @@ -2616,7 +2694,9 @@ def apply(self, func, *args): # TODO: should we define this as a dataclass instead? 
class Grouper: - def __init__(self, key=None, level=None, freq=None, closed=None, label=None): + def __init__( + self, key=None, level=None, freq=None, closed=None, label=None + ): if key is not None and level is not None: raise ValueError("Grouper cannot specify both key and level") if (key, level) == (None, None) and not freq: @@ -2685,7 +2765,9 @@ def keys(self): dict(zip(range(nkeys), self._key_columns)) )._set_names(self.names) else: - return cudf.core.index.as_index(self._key_columns[0], name=self.names[0]) + return cudf.core.index.as_index( + self._key_columns[0], name=self.names[0] + ) @property def values(self) -> cudf.core.frame.Frame: @@ -2772,7 +2854,9 @@ def serialize(self): def deserialize(cls, header, frames): names = pickle.loads(header["names"]) _named_columns = pickle.loads(header["_named_columns"]) - key_columns = cudf.core.column.deserialize_columns(header["columns"], frames) + key_columns = cudf.core.column.deserialize_columns( + header["columns"], frames + ) out = _Grouping.__new__(_Grouping) out.names = names out._named_columns = _named_columns diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1d0efc3ad47..bd9dc1ae3da 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -141,7 +141,8 @@ def _index_from_data(data: MutableMapping, name: Any = no_default): index_class_type = IntervalIndex else: raise NotImplementedError( - "Unsupported column type passed to " f"create an Index: {type(values)}" + "Unsupported column type passed to " + f"create an Index: {type(values)}" ) else: index_class_type = cudf.MultiIndex @@ -200,7 +201,9 @@ class RangeIndex(BaseIndex, BinaryOperand): _range: range @_cudf_nvtx_annotate - def __init__(self, start, stop=None, step=1, dtype=None, copy=False, name=None): + def __init__( + self, start, stop=None, step=1, dtype=None, copy=False, name=None + ): if step == 0: raise ValueError("Step must not be zero.") if not cudf.api.types.is_hashable(name): @@ -216,14 +219,20 @@ def __init__(self, start, stop=None, step=1, dtype=None, copy=False, name=None): if stop is None: start, stop = 0, start if not is_integer(start): - raise TypeError(f"start must be an integer, not {type(start).__name__}") + raise TypeError( + f"start must be an integer, not {type(start).__name__}" + ) self._start = int(start) if not is_integer(stop): - raise TypeError(f"stop must be an integer, not {type(stop).__name__}") + raise TypeError( + f"stop must be an integer, not {type(stop).__name__}" + ) self._stop = int(stop) if step is not None: if not is_integer(step): - raise TypeError(f"step must be an integer, not {type(step).__name__}") + raise TypeError( + f"step must be an integer, not {type(step).__name__}" + ) self._step = int(step) else: self._step = 1 @@ -234,7 +243,9 @@ def __init__(self, start, stop=None, step=1, dtype=None, copy=False, name=None): # whereas _stop is an upper bound. self._end = self._start + self._step * (len(self._range) - 1) - def _copy_type_metadata(self, other: RangeIndex, *, override_dtypes=None) -> Self: + def _copy_type_metadata( + self, other: RangeIndex, *, override_dtypes=None + ) -> Self: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. 
return self @@ -330,7 +341,9 @@ def hasnans(self): @property # type: ignore @_cudf_nvtx_annotate def _data(self): - return cudf.core.column_accessor.ColumnAccessor({self.name: self._values}) + return cudf.core.column_accessor.ColumnAccessor( + {self.name: self._values} + ) @_cudf_nvtx_annotate def __contains__(self, item): @@ -528,7 +541,9 @@ def __mul__(self, other): ): other = other.item() if isinstance(other, (int, np.integer)): - return RangeIndex(self.start * other, self.stop * other, self.step * other) + return RangeIndex( + self.start * other, self.stop * other, self.step * other + ) return self._as_int_index().__mul__(other) @_cudf_nvtx_annotate @@ -544,7 +559,9 @@ def _as_int_index(self): @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - return self._as_int_index().__array_ufunc__(ufunc, method, *inputs, **kwargs) + return self._as_int_index().__array_ufunc__( + ufunc, method, *inputs, **kwargs + ) @_cudf_nvtx_annotate def get_indexer(self, target, limit=None, method=None, tolerance=None): @@ -699,7 +716,9 @@ def _intersection(self, other, sort=None): # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = first.start + (second.start - first.start) * first.step // gcd * s + tmp_start = ( + first.start + (second.start - first.start) * first.step // gcd * s + ) new_step = first.step * second.step // gcd no_steps = -(-(int_low - tmp_start) // abs(new_step)) new_start = tmp_start + abs(new_step) * no_steps @@ -718,7 +737,9 @@ def difference(self, other, sort=None): if isinstance(other, RangeIndex) and self.equals(other): return self[:0]._get_reconciled_name_object(other) - return self._try_reconstruct_range_index(super().difference(other, sort=sort)) + return self._try_reconstruct_range_index( + super().difference(other, sort=sort) + ) def _try_reconstruct_range_index(self, index): if isinstance(index, RangeIndex) or index.dtype.kind == "f": @@ -780,14 +801,18 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return cudf.Index._from_data({self.name: self._as_int_index()._split(splits)}) + return cudf.Index._from_data( + {self.name: self._as_int_index()._split(splits)} + ) def _binaryop(self, other, op: str): # TODO: certain binops don't require materializing range index and # could use some optimization. 
return self._as_int_index()._binaryop(other, op=op) - def join(self, other, how="left", level=None, return_indexers=False, sort=False): + def join( + self, other, how="left", level=None, return_indexers=False, sort=False + ): if how in {"left", "right"} or self.equals(other): # pandas supports directly merging RangeIndex objects and can # intelligently create RangeIndex outputs depending on the type of @@ -802,10 +827,14 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) sort=sort, ) if return_indexers: - return tuple(cudf.from_pandas(result[0]), result[1], result[2]) + return tuple( + cudf.from_pandas(result[0]), result[1], result[2] + ) else: return cudf.from_pandas(result) - return self._as_int_index().join(other, how, level, return_indexers, sort) + return self._as_int_index().join( + other, how, level, return_indexers, sort + ) @property # type: ignore @_cudf_nvtx_annotate @@ -832,7 +861,9 @@ def argsort( raise ValueError(f"invalid na_position: {na_position}") indices = cupy.arange(0, len(self)) - if (ascending and self._step < 0) or (not ascending and self._step > 0): + if (ascending and self._step < 0) or ( + not ascending and self._step > 0 + ): indices = indices[::-1] return indices @@ -877,7 +908,9 @@ def _minmax(self, meth: str): no_steps = len(self) - 1 if no_steps == -1: return np.nan - elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0): + elif (meth == "min" and self.step > 0) or ( + meth == "max" and self.step < 0 + ): return self.start return self.start + self.step * no_steps @@ -974,7 +1007,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): inputs = self._make_operands_for_binop(other) else: inputs = { - name: (col, None, False, None) for name, col in self._data.items() + name: (col, None, False, None) + for name, col in self._data.items() } data = self._apply_cupy_ufunc_to_operands( @@ -1041,7 +1075,9 @@ def _binaryop( other_name = getattr(other, "name", self.name) ret.name = ( - self.name if cudf.utils.utils._is_same_name(self.name, other_name) else None + self.name + if cudf.utils.utils._is_same_name(self.name, other_name) + else None ) # pandas returns numpy arrays when the outputs are boolean. We @@ -1056,8 +1092,12 @@ def _binaryop( # Override just to make mypy happy. @_cudf_nvtx_annotate - def _copy_type_metadata(self, other: Self, *, override_dtypes=None) -> Self: - return super()._copy_type_metadata(other, override_dtypes=override_dtypes) + def _copy_type_metadata( + self, other: Self, *, override_dtypes=None + ) -> Self: + return super()._copy_type_metadata( + other, override_dtypes=override_dtypes + ) @property # type: ignore @_cudf_nvtx_annotate @@ -1070,7 +1110,9 @@ def _concat(cls, objs): non_empties = [index for index in objs if len(index)] if len(objs) != len(non_empties): # Do not remove until pandas-3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. 
In a future version, this will no longer exclude " @@ -1105,7 +1147,11 @@ def is_unique(self): @_cudf_nvtx_annotate def equals(self, other): - if other is None or not isinstance(other, BaseIndex) or len(self) != len(other): + if ( + other is None + or not isinstance(other, BaseIndex) + or len(self) != len(other) + ): return False check_dtypes = False @@ -1120,7 +1166,9 @@ def equals(self, other): check_dtypes = True try: - return self._column.equals(other._column, check_dtypes=check_dtypes) + return self._column.equals( + other._column, check_dtypes=check_dtypes + ) except TypeError: return False @@ -1172,7 +1220,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if not self.is_unique: raise ValueError("Cannot get index for a non-unique Index.") - is_sorted = self.is_monotonic_increasing or self.is_monotonic_decreasing + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) if not is_sorted and method is not None: raise ValueError( @@ -1226,7 +1276,9 @@ def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") - is_sorted = self.is_monotonic_increasing or self.is_monotonic_decreasing + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( @@ -1238,7 +1290,11 @@ def get_loc(self, key): if lower_bound + 1 == upper_bound: # Search result is unique, return int. - return lower_bound if is_sorted else sort_inds.element_indexing(lower_bound) + return ( + lower_bound + if is_sorted + else sort_inds.element_indexing(lower_bound) + ) if is_sorted: # In monotonic index, lex search result is continuous. A slice for @@ -1286,7 +1342,9 @@ def __repr__(self): ) ) break_idx = output.find("ordered=") - output = output[:break_idx].replace("'", "") + output[break_idx:] + output = ( + output[:break_idx].replace("'", "") + output[break_idx:] + ) else: output = repr(preprocess.to_pandas()) @@ -1535,7 +1593,9 @@ def str(self): if isinstance(self._values, cudf.core.column.StringColumn): return StringMethods(parent=self) else: - raise AttributeError("Can only use .str accessor with string values!") + raise AttributeError( + "Can only use .str accessor with string values!" + ) @cache @_warn_no_dask_cudf @@ -1678,7 +1738,9 @@ def _copy_type_metadata( return self @classmethod - def _from_data(cls, data: MutableMapping, name: Any = no_default, freq: Any = None): + def _from_data( + cls, data: MutableMapping, name: Any = no_default, freq: Any = None + ): result = super()._from_data(data, name) result._freq = _validate_freq(freq) return result @@ -2043,7 +2105,9 @@ def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." 
+ ) elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") @@ -2229,7 +2293,9 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): result_col = delocalize(self._column) else: result_col = localize(self._column, tz, ambiguous, nonexistent) - return DatetimeIndex._from_data({self.name: result_col}, freq=self._freq) + return DatetimeIndex._from_data( + {self.name: result_col}, freq=self._freq + ) def tz_convert(self, tz): """ @@ -2379,7 +2445,9 @@ def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.TimedeltaIndex: if arrow_type and nullable: - raise ValueError(f"{arrow_type=} and {nullable=} cannot both be set.") + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") @@ -2396,7 +2464,9 @@ def days(self): Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return as_index(arbitrary=self._values.days, name=self.name, dtype="int64") + return as_index( + arbitrary=self._values.days, name=self.name, dtype="int64" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2404,7 +2474,9 @@ def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index(arbitrary=self._values.seconds, name=self.name, dtype="int32") + return as_index( + arbitrary=self._values.seconds, name=self.name, dtype="int32" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -2516,7 +2588,8 @@ def __init__( if isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): if categories is not None or ordered is not None: raise ValueError( - "Cannot specify `categories` or " "`ordered` together with `dtype`." + "Cannot specify `categories` or " + "`ordered` together with `dtype`." ) if copy: data = column.as_column(data, dtype=dtype).copy(deep=True) @@ -2540,7 +2613,9 @@ def __init__( ordered=data.ordered, ) else: - data = column.as_column(data, dtype="category" if dtype is None else dtype) + data = column.as_column( + data, dtype="category" if dtype is None else dtype + ) # dtype has already been taken care dtype = None @@ -2665,7 +2740,8 @@ def interval_range( end = start + periods * freq if any( - not _is_non_decimal_numeric_dtype(x.dtype) for x in (start, periods, freq, end) + not _is_non_decimal_numeric_dtype(x.dtype) + for x in (start, periods, freq, end) ): raise ValueError("start, end, periods, freq must be numeric values.") @@ -2791,7 +2867,9 @@ def closed(self): def from_breaks( cls, breaks, - closed: Optional[Literal["left", "right", "neither", "both"]] = "right", + closed: Optional[ + Literal["left", "right", "neither", "both"] + ] = "right", name=None, copy: bool = False, dtype=None, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b99fed499bb..ca9d5590044 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -211,8 +211,8 @@ def _get_label_range_or_mask(index, start, stop, step): return slice(start_loc, stop_loc) else: raise KeyError( - "Value based partial slicing on non-monotonic DatetimeIndexes " - "with non-existing keys is not allowed.", + "Value based partial slicing on non-monotonic " + "DatetimeIndexes with non-existing keys is not allowed.", ) elif start is not None: boolean_mask = index >= start @@ -354,7 +354,9 @@ def __round__(self, digits=0): # this method. 
return self.round(decimals=digits) - def _mimic_inplace(self, result: Self, inplace: bool = False) -> Optional[Self]: + def _mimic_inplace( + self, result: Self, inplace: bool = False + ) -> Optional[Self]: if inplace: self._index = result._index return super()._mimic_inplace(result, inplace) @@ -510,7 +512,9 @@ def empty(self): def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" - return cudf.io.json.to_json(self, path_or_buf=path_or_buf, *args, **kwargs) + return cudf.io.json.to_json( + self, path_or_buf=path_or_buf, *args, **kwargs + ) @_cudf_nvtx_annotate @ioutils.doc_to_hdf() @@ -1009,7 +1013,11 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): if self.ndim == 1: # In case of series and Index, # swap lower and upper if lower > upper - if lower[0] is not None and upper[0] is not None and (lower[0] > upper[0]): + if ( + lower[0] is not None + and upper[0] is not None + and (lower[0] > upper[0]) + ): lower[0], upper[0] = upper[0], lower[0] data = { @@ -1104,8 +1112,12 @@ def dot(self, other, reflect=False): elif isinstance(self, cudf.DataFrame) and isinstance( other, (cudf.Series, cudf.DataFrame) ): - common = self._data.to_pandas_index().union(other.index.to_pandas()) - if len(common) > len(self._data.names) or len(common) > len(other.index): + common = self._data.to_pandas_index().union( + other.index.to_pandas() + ) + if len(common) > len(self._data.names) or len(common) > len( + other.index + ): raise ValueError("matrices are not aligned") lhs = self.reindex(columns=common, copy=False) @@ -1481,7 +1493,9 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def median( + self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + ): """ Return the median of the values for the requested axis. @@ -1834,7 +1848,9 @@ def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: @_cudf_nvtx_annotate @copy_docstring(Rolling) - def rolling(self, window, min_periods=None, center=False, axis=0, win_type=None): + def rolling( + self, window, min_periods=None, center=False, axis=0, win_type=None + ): return Rolling( self, window, @@ -1911,7 +1927,11 @@ def _copy_type_metadata( See `ColumnBase._with_type_metadata` for more information. """ super()._copy_type_metadata(other, override_dtypes=override_dtypes) - if include_index and self._index is not None and other._index is not None: + if ( + include_index + and self._index is not None + and other._index is not None + ): self._index._copy_type_metadata(other._index) # When other._index is a CategoricalIndex, the current index # will be a NumericalIndex with an underlying CategoricalColumn @@ -1920,7 +1940,9 @@ def _copy_type_metadata( # appropriate index. 
if isinstance( other._index, cudf.core.index.CategoricalIndex - ) and not isinstance(self._index, cudf.core.index.CategoricalIndex): + ) and not isinstance( + self._index, cudf.core.index.CategoricalIndex + ): self._index = cudf.Index( cast("cudf.Index", self._index)._column, name=self._index.name, @@ -2038,7 +2060,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if freq is not None: raise ValueError("The freq argument is not yet supported.") - data_columns = (col.shift(periods, fill_value) for col in self._columns) + data_columns = ( + col.shift(periods, fill_value) for col in self._columns + ) return self.__class__._from_data( zip(self._column_names, data_columns), self._index ) @@ -2425,7 +2449,8 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None): ---------- axis : {0 or 'index', 1 or 'columns', None}, default None A specific axis to squeeze. By default, all length-1 axes are - squeezed. For `Series` this parameter is unused and defaults to `None`. + squeezed. For `Series` this parameter is unused and defaults + to `None`. Returns ------- @@ -2685,7 +2710,9 @@ def sort_index( by=by, ascending=ascending, na_position=na_position ) out = self._gather( - GatherMap.from_column_unchecked(inds, len(self), nullify=False) + GatherMap.from_column_unchecked( + inds, len(self), nullify=False + ) ) # TODO: frame factory function should handle multilevel column # names @@ -2699,7 +2726,9 @@ def sort_index( ): out = self.copy() else: - inds = idx.argsort(ascending=ascending, na_position=na_position) + inds = idx.argsort( + ascending=ascending, na_position=na_position + ) out = self._gather( GatherMap.from_column_unchecked( cudf.core.column.as_column(inds), @@ -2920,12 +2949,16 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: has_range_index = isinstance(index, RangeIndex) if len(range(start, stop, stride)) == 0: # Avoid materialising the range index column - result = self._empty_like(keep_index=keep_index and not has_range_index) + result = self._empty_like( + keep_index=keep_index and not has_range_index + ) if keep_index and has_range_index: lo = index.start + start * index.step hi = index.start + stop * index.step step = index.step * stride - result.index = RangeIndex(start=lo, stop=hi, step=step, name=index.name) + result.index = RangeIndex( + start=lo, stop=hi, step=step, name=index.name + ) return result if start < 0: start = start + num_rows @@ -2960,7 +2993,11 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: ) columns_to_slice = [ - *(self._index._data.columns if keep_index and not has_range_index else []), + *( + self._index._data.columns + if keep_index and not has_range_index + else [] + ), *self._columns, ] result = self._from_columns_like_self( @@ -2975,14 +3012,18 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: result.index = self.index[start:stop] return result - def _positions_from_column_names(self, column_names, offset_by_index_columns=False): + def _positions_from_column_names( + self, column_names, offset_by_index_columns=False + ): """Map each column name into their positions in the frame. Return positions of the provided column names, offset by the number of index columns if `offset_by_index_columns` is True. The order of indices returned corresponds to the column order in this Frame. 
""" - num_index_columns = len(self._index._data) if offset_by_index_columns else 0 + num_index_columns = ( + len(self._index._data) if offset_by_index_columns else 0 + ) return [ i + num_index_columns for i, name in enumerate(self._column_names) @@ -3011,7 +3052,8 @@ def drop_duplicates( """ if not isinstance(ignore_index, (np.bool_, bool)): raise ValueError( - f"{ignore_index=} must be bool, " f"not {type(ignore_index).__name__}" + f"{ignore_index=} must be bool, " + f"not {type(ignore_index).__name__}" ) subset = self._preprocess_subset(subset) subset_cols = [name for name in self._column_names if name in subset] @@ -3133,7 +3175,9 @@ def duplicated(self, subset=None, keep="first"): columns = [self._column] else: columns = [self._data[n] for n in subset] - distinct = libcudf.stream_compaction.distinct_indices(columns, keep=keep) + distinct = libcudf.stream_compaction.distinct_indices( + columns, keep=keep + ) (result,) = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, @@ -3180,10 +3224,14 @@ def _split(self, splits, keep_index=True): ] @_cudf_nvtx_annotate - def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): # noqa: D102 + def fillna( + self, value=None, method=None, axis=None, inplace=False, limit=None + ): # noqa: D102 if method is not None: # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"{type(self).__name__}.fillna with 'method' is " "deprecated and will raise in a future version. " @@ -3406,7 +3454,9 @@ def _apply(self, func, kernel_getter, *args, **kwargs): self, func, args, kernel_getter=kernel_getter ) except Exception as e: - raise ValueError("user defined function compilation failed.") from e + raise ValueError( + "user defined function compilation failed." + ) from e # Mask and data column preallocated ans_col = _return_arr_from_dtype(retty, len(self)) @@ -3505,7 +3555,10 @@ def sort_values( ), keep_index=not ignore_index, ) - if isinstance(self, cudf.core.dataframe.DataFrame) and self._data.multiindex: + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex + ): out.columns = self._data.to_pandas_index() return out @@ -3557,9 +3610,13 @@ def _n_largest_or_smallest( # Empty slice. 
indices = indices.slice(0, 0) else: - indices = indices.slice(*slice(None, -n - 1, -1).indices(len(self))) + indices = indices.slice( + *slice(None, -n - 1, -1).indices(len(self)) + ) return self._gather( - GatherMap.from_column_unchecked(indices, len(self), nullify=False), + GatherMap.from_column_unchecked( + indices, len(self), nullify=False + ), keep_index=True, ) else: @@ -3596,7 +3653,9 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - result = self.__class__._from_data(data=result._data, index=result.index) + result = self.__class__._from_data( + data=result._data, index=result.index + ) result._data.multiindex = self._data.multiindex result._data._level_names = self._data._level_names result.index.names = self.index.names @@ -3642,7 +3701,9 @@ def _reindex( df = self if index is not None: if not df._index.is_unique: - raise ValueError("cannot reindex on an axis with duplicate labels") + raise ValueError( + "cannot reindex on an axis with duplicate labels" + ) index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) @@ -3657,7 +3718,9 @@ def _reindex( if not idx_dtype_match: column_names = ( - column_names if column_names is not None else list(df._column_names) + column_names + if column_names is not None + else list(df._column_names) ) df = cudf.DataFrame() else: @@ -3665,7 +3728,9 @@ def _reindex( rhs = cudf.DataFrame._from_data( { # bookkeeping workaround for unnamed series - (name or 0) if isinstance(self, cudf.Series) else name: col + (name or 0) + if isinstance(self, cudf.Series) + else name: col for name, col in df._data.items() }, index=df._index, @@ -3694,7 +3759,9 @@ def _reindex( names = column_names if isinstance(names, cudf.Index): names = names.to_pandas() - rangeindex = isinstance(column_names, (pd.RangeIndex, cudf.RangeIndex)) + rangeindex = isinstance( + column_names, (pd.RangeIndex, cudf.RangeIndex) + ) level_names = tuple(column_names.names) else: names = column_names @@ -3821,7 +3888,9 @@ def round(self, decimals=0, how="half_even"): elif isinstance(decimals, int): decimals = {name: decimals for name in self._column_names} elif not isinstance(decimals, abc.Mapping): - raise TypeError("decimals must be an integer, a dict-like or a Series") + raise TypeError( + "decimals must be an integer, a dict-like or a Series" + ) cols = { name: col.round(decimals[name], how=how) @@ -4014,14 +4083,18 @@ def resample( "- origin\n" "- offset" ) - by = cudf.Grouper(key=on, freq=rule, closed=closed, label=label, level=level) + by = cudf.Grouper( + key=on, freq=rule, closed=closed, label=label, level=level + ) return ( cudf.core.resample.SeriesResampler(self, by=by) if isinstance(self, cudf.Series) else cudf.core.resample.DataFrameResampler(self, by=by) ) - def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): + def dropna( + self, axis=0, how="any", thresh=None, subset=None, inplace=False + ): """ Drop rows (or columns) containing nulls from a Column. 
@@ -4121,7 +4194,9 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): if axis == 0: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) else: - result = self._drop_na_columns(how=how, subset=subset, thresh=thresh) + result = self._drop_na_columns( + how=how, subset=subset, thresh=thresh + ) return self._mimic_inplace(result, inplace=inplace) @@ -4148,7 +4223,9 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): check_col = col.nans_to_nulls() except AttributeError: check_col = col - no_threshold_valid_count = (len(col) - check_col.null_count) < thresh + no_threshold_valid_count = ( + len(col) - check_col.null_count + ) < thresh if no_threshold_valid_count: continue out_cols.append(name) @@ -4321,7 +4398,9 @@ def _first_or_last( if not isinstance(self._index, cudf.core.index.DatetimeIndex): raise TypeError("'first' only supports a DatetimeIndex index.") if not isinstance(offset, str): - raise NotImplementedError(f"Unsupported offset type {type(offset)}.") + raise NotImplementedError( + f"Unsupported offset type {type(offset)}." + ) if len(self) == 0: return self.copy() @@ -4341,7 +4420,9 @@ def _first_or_last( return self.loc[:to_search] needle = as_column(to_search, dtype=self._index.dtype) end_point = int( - self._index._column.searchsorted(needle, side=side).element_indexing(0) + self._index._column.searchsorted( + needle, side=side + ).element_indexing(0) ) return slice_func(end_point) @@ -4563,11 +4644,15 @@ def sample( "population `frac` > 1." ) if n is not None: - raise ValueError("Please enter a value for `frac` OR `n`, not both.") + raise ValueError( + "Please enter a value for `frac` OR `n`, not both." + ) n = int(round(size * frac)) if n > 0 and size == 0: - raise ValueError("Cannot take a sample larger than 0 when axis is empty.") + raise ValueError( + "Cannot take a sample larger than 0 when axis is empty." + ) if isinstance(random_state, cp.random.RandomState): lib = cp @@ -4597,14 +4682,18 @@ def sample( weights = weights / weights.sum() if axis == 0: - return self._sample_axis_0(n, weights, replace, random_state, ignore_index) + return self._sample_axis_0( + n, weights, replace, random_state, ignore_index + ) else: if isinstance(random_state, cp.random.RandomState): raise ValueError( "Sampling from `axis=1`/`columns` with cupy random state" "isn't supported." 
) - return self._sample_axis_1(n, weights, replace, random_state, ignore_index) + return self._sample_axis_1( + n, weights, replace, random_state, ignore_index + ) def _sample_axis_0( self, @@ -4617,7 +4706,9 @@ def _sample_axis_0( try: gather_map = GatherMap.from_column_unchecked( cudf.core.column.as_column( - random_state.choice(len(self), size=n, replace=replace, p=weights) + random_state.choice( + len(self), size=n, replace=replace, p=weights + ) ), len(self), nullify=False, @@ -4661,7 +4752,9 @@ def _binaryop( if operands is NotImplemented: return NotImplemented - level_names = self._data._level_names if can_use_self_column_name else None + level_names = ( + self._data._level_names if can_use_self_column_name else None + ) return self._from_data( ColumnAccessor( type(self)._colwise_binop(operands, op), @@ -4701,11 +4794,14 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if cupy_func: if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index, _ = self._make_operands_and_index_for_binop(other, fname) + inputs, index, _ = self._make_operands_and_index_for_binop( + other, fname + ) else: # This works for Index too inputs = { - name: (col, None, False, None) for name, col in self._data.items() + name: (col, None, False, None) + for name, col in self._data.items() } index = self._index @@ -4784,7 +4880,9 @@ def repeat(self, repeats, axis=None): dtype: int64 """ return self._from_columns_like_self( - Frame._repeat([*self._index._data.columns, *self._columns], repeats, axis), + Frame._repeat( + [*self._index._data.columns, *self._columns], repeats, axis + ), self._column_names, self._index_names, ) @@ -5088,7 +5186,9 @@ def drop( """ if labels is not None: if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) target = labels elif index is not None: target = index @@ -5098,7 +5198,8 @@ def drop( axis = 1 else: raise ValueError( - "Need to specify at least one of 'labels', " "'index' or 'columns'" + "Need to specify at least one of 'labels', " + "'index' or 'columns'" ) if inplace: @@ -5149,7 +5250,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): ) # We must copy inner datatype of the exploded list column to # maintain struct dtype key names - exploded_dtype = cast(ListDtype, self._columns[column_index].dtype).element_type + exploded_dtype = cast( + ListDtype, self._columns[column_index].dtype + ).element_type return self._from_columns_like_self( exploded, self._column_names, @@ -5186,7 +5289,9 @@ def tile(self, count): The indexed frame containing the tiled "rows". 
""" return self._from_columns_like_self( - libcudf.reshape.tile([*self._index._columns, *self._columns], count), + libcudf.reshape.tile( + [*self._index._columns, *self._columns], count + ), column_names=self._column_names, index_names=self._index_names, ) @@ -5211,13 +5316,19 @@ def groupby( raise NotImplementedError("axis parameter is not yet implemented") if squeeze is not False: - raise NotImplementedError("squeeze parameter is not yet implemented") + raise NotImplementedError( + "squeeze parameter is not yet implemented" + ) if not observed: - raise NotImplementedError("observed parameter is not yet implemented") + raise NotImplementedError( + "observed parameter is not yet implemented" + ) if by is None and level is None: - raise TypeError("groupby() requires either by or level to be specified.") + raise TypeError( + "groupby() requires either by or level to be specified." + ) if group_keys is None: group_keys = False @@ -6126,7 +6237,9 @@ def rank( method_enum = libcudf.pylibcudf.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: - raise ValueError("na_option must be one of 'keep', 'top', or 'bottom'") + raise ValueError( + "na_option must be one of 'keep', 'top', or 'bottom'" + ) if axis not in (0, "index"): raise NotImplementedError( @@ -6136,9 +6249,9 @@ def rank( source = self if numeric_only: - if isinstance(source, cudf.Series) and not _is_non_decimal_numeric_dtype( - self.dtype - ): + if isinstance( + source, cudf.Series + ) and not _is_non_decimal_numeric_dtype(self.dtype): raise TypeError( "Series.rank does not allow numeric_only=True with " "non-numeric dtype." @@ -6256,7 +6369,9 @@ def _get_replacement_values_for_columns( if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} values_columns = {col: [value] for col in columns_dtype_map} - elif cudf.api.types.is_list_like(to_replace) or isinstance(to_replace, ColumnBase): + elif cudf.api.types.is_list_like(to_replace) or isinstance( + to_replace, ColumnBase + ): if is_scalar(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = { @@ -6277,13 +6392,17 @@ def _get_replacement_values_for_columns( f" Expected {len(to_replace)}, got {len(value)}." 
) else: - to_replace_columns = {col: to_replace for col in columns_dtype_map} + to_replace_columns = { + col: to_replace for col in columns_dtype_map + } values_columns = {col: value for col in columns_dtype_map} elif cudf.utils.dtypes.is_column_like(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = {col: value for col in columns_dtype_map} else: - raise TypeError("value argument must be scalar, list-like or Series") + raise TypeError( + "value argument must be scalar, list-like or Series" + ) elif _is_series(to_replace): if value is None or value is no_default: to_replace_columns = { @@ -6292,14 +6411,18 @@ def _get_replacement_values_for_columns( values_columns = {col: to_replace for col in columns_dtype_map} elif is_dict_like(value): to_replace_columns = { - col: to_replace[col] for col in columns_dtype_map if col in to_replace + col: to_replace[col] + for col in columns_dtype_map + if col in to_replace } values_columns = { col: value[col] for col in to_replace_columns if col in value } elif is_scalar(value) or _is_series(value): to_replace_columns = { - col: to_replace[col] for col in columns_dtype_map if col in to_replace + col: to_replace[col] + for col in columns_dtype_map + if col in to_replace } values_columns = { col: [value] if is_scalar(value) else value[col] @@ -6308,7 +6431,8 @@ def _get_replacement_values_for_columns( } else: raise ValueError( - "Series.replace cannot use dict-like to_replace and non-None " "value" + "Series.replace cannot use dict-like to_replace and non-None " + "value" ) elif is_dict_like(to_replace): if value is None or value is no_default: @@ -6320,14 +6444,18 @@ def _get_replacement_values_for_columns( } elif is_dict_like(value): to_replace_columns = { - col: to_replace[col] for col in columns_dtype_map if col in to_replace + col: to_replace[col] + for col in columns_dtype_map + if col in to_replace } values_columns = { col: value[col] for col in columns_dtype_map if col in value } elif is_scalar(value) or _is_series(value): to_replace_columns = { - col: to_replace[col] for col in columns_dtype_map if col in to_replace + col: to_replace[col] + for col in columns_dtype_map + if col in to_replace } values_columns = { col: [value] if is_scalar(value) else value @@ -6355,7 +6483,9 @@ def _get_replacement_values_for_columns( for i in to_replace_columns: if i in values_columns: if isinstance(values_columns[i], list): - all_na = values_columns[i].count(None) == len(values_columns[i]) + all_na = values_columns[i].count(None) == len( + values_columns[i] + ) else: all_na = False all_na_columns[i] = all_na @@ -6422,7 +6552,9 @@ def _drop_rows_by_labels( join_res = working_df.join(to_join, how="leftanti") # 4. 
Reconstruct original layout, and rename - join_res._insert(ilevel, name=join_res._index.name, value=join_res._index) + join_res._insert( + ilevel, name=join_res._index.name, value=join_res._index + ) midx = cudf.MultiIndex.from_frame( join_res.iloc[:, 0:idx_nlv], names=obj._index.names @@ -6445,7 +6577,9 @@ def _drop_rows_by_labels( key_df = cudf.DataFrame._from_data( data={}, - index=cudf.Index(labels, name=getattr(labels, "name", obj.index.name)), + index=cudf.Index( + labels, name=getattr(labels, "name", obj.index.name) + ), ) if isinstance(obj, cudf.DataFrame): res = obj.join(key_df, how="leftanti") diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 66b364a1686..7242de9964f 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. from __future__ import annotations @@ -108,13 +108,17 @@ def destructure_iloc_key( # shape of frame indexers = key + (slice(None),) * (n - len(key)) if len(indexers) > n: - raise IndexError(f"Too many indexers: got {len(indexers)} expected {n}") + raise IndexError( + f"Too many indexers: got {len(indexers)} expected {n}" + ) else: # Key indexes rows, slice-expand to shape of frame indexers = (key, *(slice(None),) * (n - 1)) indexers = tuple(k(frame) if callable(k) else k for k in indexers) if any(isinstance(k, tuple) for k in indexers): - raise IndexError("Too many indexers: can't have nested tuples in iloc indexing") + raise IndexError( + "Too many indexers: can't have nested tuples in iloc indexing" + ) return indexers @@ -150,14 +154,17 @@ def destructure_dataframe_iloc_indexer( cols = slice(None) scalar = is_integer(cols) try: - column_names: ColumnLabels = list(frame._data.get_labels_by_index(cols)) + column_names: ColumnLabels = list( + frame._data.get_labels_by_index(cols) + ) if len(set(column_names)) != len(column_names): raise NotImplementedError( "cudf DataFrames do not support repeated column names" ) except TypeError: raise TypeError( - "Column indices must be integers, slices, " "or list-like of integers" + "Column indices must be integers, slices, " + "or list-like of integers" ) if scalar: assert ( @@ -231,5 +238,6 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( - "Cannot index by location " f"with non-integer key of type {type(key)}" + "Cannot index by location " + f"with non-integer key of type {type(key)}" ) diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index c5163407c30..6a619945e75 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -91,7 +91,8 @@ def _match_join_keys( np.issubdtype(ltype, np.number) and np.issubdtype(rtype, np.number) and not ( - np.issubdtype(ltype, np.timedelta64) or np.issubdtype(rtype, np.timedelta64) + np.issubdtype(ltype, np.timedelta64) + or np.issubdtype(rtype, np.timedelta64) ) ): common_type = ( @@ -100,20 +101,24 @@ def _match_join_keys( else np.result_type(ltype, rtype) ) elif ( - np.issubdtype(ltype, np.datetime64) and np.issubdtype(rtype, np.datetime64) + np.issubdtype(ltype, np.datetime64) + and np.issubdtype(rtype, np.datetime64) ) or ( - np.issubdtype(ltype, np.timedelta64) and np.issubdtype(rtype, np.timedelta64) + np.issubdtype(ltype, np.timedelta64) + and np.issubdtype(rtype, np.timedelta64) ): 
common_type = max(ltype, rtype) elif ( - np.issubdtype(ltype, np.datetime64) or np.issubdtype(ltype, np.timedelta64) + np.issubdtype(ltype, np.datetime64) + or np.issubdtype(ltype, np.timedelta64) ) and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) elif ( - np.issubdtype(rtype, np.datetime64) or np.issubdtype(rtype, np.timedelta64) + np.issubdtype(rtype, np.datetime64) + or np.issubdtype(rtype, np.timedelta64) ) and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " @@ -140,7 +145,8 @@ def _match_categorical_dtypes_both( # ambiguous and not allowed. if ltype.ordered != rtype.ordered: raise TypeError( - "Merging on categorical variables with mismatched" " ordering is ambiguous" + "Merging on categorical variables with mismatched" + " ordering is ambiguous" ) if ltype.ordered and rtype.ordered: @@ -170,7 +176,9 @@ def _match_categorical_dtypes_both( merged_categories = cudf.concat( [ltype.categories, rtype.categories] ).unique() - common_type = cudf.CategoricalDtype(categories=merged_categories, ordered=False) + common_type = cudf.CategoricalDtype( + categories=merged_categories, ordered=False + ) return lcol.astype(common_type), rcol.astype(common_type) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index ccabb93c1fe..1ef2915bc59 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -125,7 +125,9 @@ def __init__( self.sort = sort or ( cudf.get_option("mode.pandas_compatible") and how == "outer" ) - self.preserve_key_order = cudf.get_option("mode.pandas_compatible") and how in { + self.preserve_key_order = cudf.get_option( + "mode.pandas_compatible" + ) and how in { "inner", "outer", "left", @@ -137,10 +139,16 @@ def __init__( # don't have any other args, so we can apply it directly to left_on and # right_on. 
self._using_left_index = bool(left_index) - left_on = lhs.index._data.names if left_index else left_on if left_on else on + left_on = ( + lhs.index._data.names if left_index else left_on if left_on else on + ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names if right_index else right_on if right_on else on + rhs.index._data.names + if right_index + else right_on + if right_on + else on ) if left_on or right_on: @@ -184,7 +192,8 @@ def __init__( for lkey, rkey in zip(self._left_keys, self._right_keys) if lkey.name == rkey.name and not ( - isinstance(lkey, _IndexIndexer) or isinstance(rkey, _IndexIndexer) + isinstance(lkey, _IndexIndexer) + or isinstance(rkey, _IndexIndexer) ) } ) @@ -222,7 +231,11 @@ def _gather_maps(self, left_cols, right_cols): key_order = list( itertools.chain.from_iterable( libcudf.copying.gather( - [cudf.core.column.as_column(range(n), dtype=size_type_dtype)], + [ + cudf.core.column.as_column( + range(n), dtype=size_type_dtype + ) + ], map_, nullify=null, ) @@ -262,13 +275,17 @@ def perform_merge(self) -> cudf.DataFrame: left_key.set(self.lhs, lcol_casted, validate=False) right_key.set(self.rhs, rcol_casted, validate=False) - left_rows, right_rows = self._gather_maps(left_join_cols, right_join_cols) + left_rows, right_rows = self._gather_maps( + left_join_cols, right_join_cols + ) gather_kwargs = { "keep_index": self._using_left_index or self._using_right_index, } left_result = ( self.lhs._gather( - GatherMap.from_column_unchecked(left_rows, len(self.lhs), nullify=True), + GatherMap.from_column_unchecked( + left_rows, len(self.lhs), nullify=True + ), **gather_kwargs, ) if left_rows is not None @@ -294,7 +311,9 @@ def perform_merge(self) -> cudf.DataFrame: result = self._sort_result(result) return result - def _merge_results(self, left_result: cudf.DataFrame, right_result: cudf.DataFrame): + def _merge_results( + self, left_result: cudf.DataFrame, right_result: cudf.DataFrame + ): # Merge the DataFrames `left_result` and `right_result` into a single # `DataFrame`, suffixing column names if necessary. @@ -316,7 +335,9 @@ def _merge_results(self, left_result: cudf.DataFrame, right_result: cudf.DataFra # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. 
- common_names = set(left_result._data.names) & set(right_result._data.names) + common_names = set(left_result._data.names) & set( + right_result._data.names + ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col @@ -339,7 +360,9 @@ def _merge_results(self, left_result: cudf.DataFrame, right_result: cudf.DataFra # - either one of `lhs` or `rhs` have a MultiIndex columns, # and the other is empty (i.e., no columns) if self.lhs._data and self.rhs._data: - multiindex_columns = self.lhs._data.multiindex and self.rhs._data.multiindex + multiindex_columns = ( + self.lhs._data.multiindex and self.rhs._data.multiindex + ) elif self.lhs._data: multiindex_columns = self.lhs._data.multiindex elif self.rhs._data: @@ -359,7 +382,9 @@ def _merge_results(self, left_result: cudf.DataFrame, right_result: cudf.DataFra # Construct result from data and index: return ( - left_result._data.__class__(data=data, multiindex=multiindex_columns), + left_result._data.__class__( + data=data, multiindex=multiindex_columns + ), index, ) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 2adc454b132..7bbb299d643 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. import inspect @@ -59,7 +59,9 @@ def __get__(self, obj, owner=None): retfunc.__annotations__.pop("op", None) retfunc_params = [ v - for k, v in inspect.signature(self._base_operation).parameters.items() + for k, v in inspect.signature( + self._base_operation + ).parameters.items() if k != "op" ] retfunc.__signature__ = inspect.Signature(retfunc_params) @@ -228,10 +230,12 @@ def __init_subclass__(cls): base_operation = getattr(cls, base_operation_name) for operation in valid_operations: - if _should_define_operation(cls, operation, base_operation_name): - docstring_format_args = getattr(cls, docstring_attr, {}).get( - operation, {} - ) + if _should_define_operation( + cls, operation, base_operation_name + ): + docstring_format_args = getattr( + cls, docstring_attr, {} + ).get(operation, {}) op_attr = Operation( operation, docstring_format_args, base_operation ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6046b462982..019daacddba 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -134,7 +134,9 @@ def __init__( if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") if name is not None: - raise NotImplementedError("Use `names`, `name` is not yet supported") + raise NotImplementedError( + "Use `names`, `name` is not yet supported" + ) if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") if not isinstance(codes, cudf.DataFrame) and not isinstance( @@ -145,7 +147,9 @@ def __init__( if copy: if isinstance(codes, cudf.DataFrame): codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance(levels[0], (cudf.Index, cudf.Series)): + if len(levels) > 0 and isinstance( + levels[0], (cudf.Index, cudf.Series) + ): levels = [level.copy(deep=True) for level in levels] if not isinstance(codes, cudf.DataFrame): @@ -171,7 +175,8 @@ def __init__( ) if len({c.size for c in codes._data.columns}) != 1: raise ValueError( - "MultiIndex length of codes does not match " "and is inconsistent!" 
+ "MultiIndex length of codes does not match " + "and is inconsistent!" ) source_data = {} @@ -179,12 +184,18 @@ def __init__( if len(code): lo, hi = libcudf.reduce.minmax(code) if lo.value < -1 or hi.value > len(level) - 1: - raise ValueError(f"Codes must be -1 <= codes <= {len(level) - 1}") + raise ValueError( + f"Codes must be -1 <= codes <= {len(level) - 1}" + ) if lo.value == -1: # Now we can gather and insert null automatically code[code == -1] = np.iinfo(size_type_dtype).min - result_col = libcudf.copying.gather([level._column], code, nullify=True) - source_data[column_name] = result_col[0]._with_type_metadata(level.dtype) + result_col = libcudf.copying.gather( + [level._column], code, nullify=True + ) + source_data[column_name] = result_col[0]._with_type_metadata( + level.dtype + ) super().__init__(source_data) self._levels = levels @@ -226,7 +237,9 @@ def names(self, value): @_cudf_nvtx_annotate def to_series(self, index=None, name=None): - raise NotImplementedError("MultiIndex.to_series isn't implemented yet.") + raise NotImplementedError( + "MultiIndex.to_series isn't implemented yet." + ) @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): @@ -290,7 +303,9 @@ def set_names(self, names, level=None, inplace=False): level_is_list_like = is_list_like(level) if level is not None and not level_is_list_like and names_is_list_like: - raise TypeError("Names must be a string when a single level is provided.") + raise TypeError( + "Names must be a string when a single level is provided." + ) if not names_is_list_like and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") @@ -437,13 +452,17 @@ def __repr__(self): column.timedelta.TimeDeltaColumn, ), ): - preprocess_df[name] = col.astype("str").fillna(str(cudf.NaT)) + preprocess_df[name] = col.astype("str").fillna( + str(cudf.NaT) + ) tuples_list = list( zip( *list( map(lambda val: pd.NA if val is None else val, col) - for col in preprocess_df.to_arrow().to_pydict().values() + for col in preprocess_df.to_arrow() + .to_pydict() + .values() ) ) ) @@ -642,7 +661,9 @@ def isin(self, values, level=None): "squences when `level=None`." 
) else: - values_idx = cudf.MultiIndex.from_tuples(values, names=self.names) + values_idx = cudf.MultiIndex.from_tuples( + values, names=self.names + ) self_df = self.to_frame(index=False).reset_index() values_df = values_idx.to_frame(index=False) idx = self_df.merge(values_df, how="leftsemi")._data["index"] @@ -656,7 +677,9 @@ def isin(self, values, level=None): return result def where(self, cond, other=None, inplace=False): - raise NotImplementedError(".where is not supported for MultiIndex operations") + raise NotImplementedError( + ".where is not supported for MultiIndex operations" + ) @_cudf_nvtx_annotate def _compute_levels_and_codes(self): @@ -687,7 +710,11 @@ def _compute_validity_mask(self, index, row_tuple, max_length): [ frame, cudf.DataFrame( - {"idx": cudf.Series(column.as_column(range(len(frame))))} + { + "idx": cudf.Series( + column.as_column(range(len(frame))) + ) + } ), ], axis=1, @@ -700,7 +727,9 @@ def _compute_validity_mask(self, index, row_tuple, max_length): if cudf.get_option("mode.pandas_compatible"): lookup_order = "_" + "_".join(map(str, lookup._data.names)) lookup[lookup_order] = column.as_column(range(len(lookup))) - postprocess = operator.methodcaller("sort_values", by=[lookup_order, "idx"]) + postprocess = operator.methodcaller( + "sort_values", by=[lookup_order, "idx"] + ) else: postprocess = lambda r: r # noqa: E731 result = postprocess(lookup.merge(data_table))["idx"] @@ -734,8 +763,12 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): start_values = self._compute_validity_mask( index, row_tuple.start, max_length ) - stop_values = self._compute_validity_mask(index, row_tuple.stop, max_length) - return column.as_column(range(start_values.min(), stop_values.max() + 1)) + stop_values = self._compute_validity_mask( + index, row_tuple.stop, max_length + ) + return column.as_column( + range(start_values.min(), stop_values.max() + 1) + ) elif isinstance(row_tuple, numbers.Number): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) @@ -744,9 +777,9 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): index_key = [index_key] - if (len(index_key) > 0 and not isinstance(index_key, tuple)) or isinstance( - index_key[0], slice - ): + if ( + len(index_key) > 0 and not isinstance(index_key, tuple) + ) or isinstance(index_key[0], slice): index_key = index_key[0] slice_access = isinstance(index_key, slice) @@ -812,7 +845,9 @@ def _index_and_downcast(self, result, index, index_key): def _get_row_major( self, df: DataFrameOrSeries, - row_tuple: Union[numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]], + row_tuple: Union[ + numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] + ], ) -> DataFrameOrSeries: if pd.api.types.is_bool_dtype( list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple @@ -835,14 +870,18 @@ def _get_row_major( @_cudf_nvtx_annotate def _validate_indexer( self, - indexer: Union[numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]], + indexer: Union[ + numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] + ], ): if isinstance(indexer, numbers.Number): return if isinstance(indexer, tuple): # drop any slice(None) from the end: indexer = tuple( - itertools.dropwhile(lambda x: x == slice(None), reversed(indexer)) + itertools.dropwhile( + lambda x: x == slice(None), reversed(indexer) + ) )[::-1] # now check for size @@ -908,7 +947,9 @@ def 
__getitem__(self, index): start, stop, step = index.indices(len(self)) index = column.as_column(range(start, stop, step)) result = MultiIndex.from_frame( - self.to_frame(index=False, name=range(0, self.nlevels)).take(index), + self.to_frame(index=False, name=range(0, self.nlevels)).take( + index + ), names=self.names, ) @@ -978,11 +1019,14 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): # modifications of the resulting DataFrame will affect the MultiIndex. if name is no_default: column_names = [ - level if name is None else name for level, name in enumerate(self.names) + level if name is None else name + for level, name in enumerate(self.names) ] else: if not is_list_like(name): - raise TypeError("'name' must be a list / sequence of column names.") + raise TypeError( + "'name' must be a list / sequence of column names." + ) if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " @@ -991,9 +1035,9 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): column_names = name all_none_names = None - if not (all_none_names := all(x is None for x in column_names)) and len( - column_names - ) != len(set(column_names)): + if not ( + all_none_names := all(x is None for x in column_names) + ) and len(column_names) != len(set(column_names)): raise ValueError("Duplicate column names are not allowed") df = cudf.DataFrame._from_data( data=self._data, @@ -1182,7 +1226,9 @@ def values(self): """ if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError("Unable to create a cupy array with tuples.") + raise NotImplementedError( + "Unable to create a cupy array with tuples." + ) return self.to_frame(index=False).values @classmethod @@ -1358,7 +1404,9 @@ def from_arrays( code, level = factorize(array, sort=True) codes.append(code) levels.append(level) - return cls(codes=codes, levels=levels, sortorder=sortorder, names=names) + return cls( + codes=codes, levels=levels, sortorder=sortorder, names=names + ) @_cudf_nvtx_annotate def _poplevels(self, level): @@ -1565,12 +1613,16 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): if not isinstance(multiindex, pd.MultiIndex): raise TypeError("not a pandas.MultiIndex") if nan_as_null is no_default: - nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None + nan_as_null = ( + False if cudf.get_option("mode.pandas_compatible") else None + ) levels = [ cudf.Index.from_pandas(level, nan_as_null=nan_as_null) for level in multiindex.levels ] - return cls(levels=levels, codes=multiindex.codes, names=multiindex.names) + return cls( + levels=levels, codes=multiindex.codes, names=multiindex.names + ) @cached_property # type: ignore @_cudf_nvtx_annotate @@ -1597,7 +1649,9 @@ def is_monotonic_decreasing(self): Return if the index is monotonic decreasing (only equal or decreasing) values. """ - return self._is_sorted(ascending=[False] * len(self.levels), null_position=None) + return self._is_sorted( + ascending=[False] * len(self.levels), null_position=None + ) @_cudf_nvtx_annotate def fillna(self, value): @@ -1776,20 +1830,27 @@ def _level_index_from_level(self, level): level += self.nlevels if level >= self.nlevels: raise IndexError( - f"Level {level} out of bounds. " f"Index has {self.nlevels} levels." + f"Level {level} out of bounds. " + f"Index has {self.nlevels} levels." 
) from None return level @_cudf_nvtx_annotate def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: - raise NotImplementedError("Parameter tolerance is not supported yet.") + raise NotImplementedError( + "Parameter tolerance is not supported yet." + ) if method == "nearest": - raise NotImplementedError(f"{method=} is not supported yet for MultiIndex.") + raise NotImplementedError( + f"{method=} is not supported yet for MultiIndex." + ) if method in {"ffill", "bfill", "pad", "backfill"} and not ( self.is_monotonic_increasing or self.is_monotonic_decreasing ): - raise ValueError("index must be monotonic increasing or decreasing") + raise ValueError( + "index must be monotonic increasing or decreasing" + ) result = column.as_column( -1, @@ -1820,7 +1881,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): index=self, positions=result_series, method=method, - target_col=target.to_frame(index=False)[list(range(0, self.nlevels))], + target_col=target.to_frame(index=False)[ + list(range(0, self.nlevels)) + ], tolerance=tolerance, ) elif method is not None: @@ -1833,7 +1896,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): @_cudf_nvtx_annotate def get_loc(self, key): - is_sorted = self.is_monotonic_increasing or self.is_monotonic_decreasing + is_sorted = ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ) is_unique = self.is_unique key = (key,) if not isinstance(key, tuple) else key @@ -1857,7 +1922,11 @@ def get_loc(self, key): if is_unique and lower_bound + 1 == upper_bound: # Indices are unique (Pandas constraint), search result is unique, # return int. - return lower_bound if is_sorted else sort_inds.element_indexing(lower_bound) + return ( + lower_bound + if is_sorted + else sort_inds.element_indexing(lower_bound) + ) if is_sorted: # In monotonic index, lex search result is continuous. A slice for @@ -2001,7 +2070,8 @@ def _split_columns_by_levels(self, levels): # Normalize named levels into indices level_names = list(self.names) level_indices = { - lv if isinstance(lv, int) else level_names.index(lv) for lv in levels + lv if isinstance(lv, int) else level_names.index(lv) + for lv in levels } # Split the columns diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index e69701ee838..ec191a974e4 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -98,7 +98,9 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): obj_type = pickle.loads(header["obj_type"]) - obj = obj_type.deserialize(header["obj"], frames[: header["num_obj_frames"]]) + obj = obj_type.deserialize( + header["obj"], frames[: header["num_obj_frames"]] + ) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) @@ -181,7 +183,9 @@ def _handle_frequency_grouper(self, by): "Resampling by DateOffset objects is not yet supported." 
) if not isinstance(freq, str): - raise TypeError(f"Unsupported type for freq: {type(freq).__name__}") + raise TypeError( + f"Unsupported type for freq: {type(freq).__name__}" + ) # convert freq to a pd.DateOffset: offset = pd.tseries.frequencies.to_offset(freq) @@ -243,7 +247,9 @@ def _handle_frequency_grouper(self, by): # column to have the same dtype, so we compute a `result_type` # and cast them both to that type. try: - result_type = np.dtype(_unit_dtype_map[_offset_alias_to_code[offset.name]]) + result_type = np.dtype( + _unit_dtype_map[_offset_alias_to_code[offset.name]] + ) except KeyError: # unsupported resolution (we don't support resolutions >s) # fall back to using datetime64[s] @@ -328,7 +334,9 @@ def _get_timestamp_range_edges( if isinstance(origin, pd.Timestamp) and (origin.tz is None) != ( index_tz is None ): - raise ValueError("The origin must have the same timezone as the index.") + raise ValueError( + "The origin must have the same timezone as the index." + ) elif origin == "epoch": # set the epoch based on the timezone to have similar bins results # when resampling on the same kind of indexes on different diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 9a213260847..2ef39e9357d 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -42,7 +42,9 @@ def _align_objs(objs, how="outer", sort=None): i_objs = iter(objs) first = next(i_objs) - not_matching_index = any(not first.index.equals(rest.index) for rest in i_objs) + not_matching_index = any( + not first.index.equals(rest.index) for rest in i_objs + ) if not_matching_index: if not all(o.index.is_unique for o in objs): @@ -57,7 +59,9 @@ def _align_objs(objs, how="outer", sort=None): final_index.name = name return [ - obj.reindex(final_index) if not final_index.equals(obj.index) else obj + obj.reindex(final_index) + if not final_index.equals(obj.index) + else obj for obj in objs ] else: @@ -238,7 +242,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): axis = _AXIS_MAP.get(axis, None) if axis is None: - raise ValueError(f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}') + raise ValueError( + f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}' + ) # Return for single object if len(objs) == 1: @@ -319,7 +325,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): any_empty = any(obj.empty for obj in objs) if any_empty: # Do not remove until pandas-3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. In a future version, this will no longer exclude " @@ -355,7 +363,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. 
- df[name] = cudf.core.column.column_empty_like(col, newsize=0) + df[name] = cudf.core.column.column_empty_like( + col, newsize=0 + ) else: df[name] = col @@ -710,7 +720,9 @@ def get_dummies( encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes(include=encode_fallback_dtypes)._column_names + columns = df.select_dtypes( + include=encode_fallback_dtypes + )._column_names _length_check_params(prefix, columns, "prefix") _length_check_params(prefix_sep, columns, "prefix_sep") @@ -745,7 +757,9 @@ def get_dummies( for name in columns: if name not in cats: - unique = _get_unique(column=df._data[name], dummy_na=dummy_na) + unique = _get_unique( + column=df._data[name], dummy_na=dummy_na + ) else: unique = as_column(cats[name]) @@ -825,7 +839,9 @@ def _merge_sorted( if keys is None: key_columns_indices = list(range(0, objs[0]._num_columns)) else: - key_columns_indices = [objs[0]._column_names.index(key) for key in keys] + key_columns_indices = [ + objs[0]._column_names.index(key) for key in keys + ] if not ignore_index: key_columns_indices = [ idx + objs[0]._index.nlevels for idx in key_columns_indices @@ -894,7 +910,10 @@ def as_tuple(x): target._data[None][scatter_map] = col result_frames = target._split(range(nrows, nrows * ncols, nrows)) result.update( - {name: next(iter(f._columns)) for name, f in zip(names, result_frames)} + { + name: next(iter(f._columns)) + for name, f in zip(names, result_frames) + } ) return cudf.DataFrame._from_data( @@ -1103,7 +1122,9 @@ def unstack(df, level, fill_value=None): ) res = df.T.stack(future_stack=False) # Result's index is a multiindex - res.index.names = tuple(df._data.to_pandas_index().names) + df.index.names + res.index.names = ( + tuple(df._data.to_pandas_index().names) + df.index.names + ) return res else: columns = df.index._poplevels(level) @@ -1424,7 +1445,9 @@ def pivot_table( # discard the top level if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] - table_columns = tuple(map(lambda column: column[1:], table._data.names)) + table_columns = tuple( + map(lambda column: column[1:], table._data.names) + ) table.columns = cudf.MultiIndex.from_tuples( tuples=table_columns, names=column_names ) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f9816c1c811..f7d05e53ce7 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -190,7 +190,9 @@ def _preprocess_host_value(self, value, dtype): if dtype is not None: raise TypeError("Lists may not be cast to a different dtype") else: - dtype = ListDtype.from_arrow(pa.infer_type([value], from_pandas=True)) + dtype = ListDtype.from_arrow( + pa.infer_type([value], from_pandas=True) + ) return value, dtype elif isinstance(dtype, ListDtype): if value not in {None, NA}: @@ -200,7 +202,9 @@ def _preprocess_host_value(self, value, dtype): if isinstance(value, dict): if dtype is None: - dtype = StructDtype.from_arrow(pa.infer_type([value], from_pandas=True)) + dtype = StructDtype.from_arrow( + pa.infer_type([value], from_pandas=True) + ) return value, dtype elif isinstance(dtype, StructDtype): if value not in {None, NA}: @@ -222,11 +226,15 @@ def _preprocess_host_value(self, value, dtype): if isinstance(value, (np.datetime64, np.timedelta64)): unit, _ = np.datetime_data(value) if unit == "generic": - raise TypeError("Cant convert generic NaT to null scalar") + raise TypeError( + "Cant convert generic NaT to null scalar" + ) else: dtype = 
value.dtype else: - raise TypeError("dtype required when constructing a null scalar") + raise TypeError( + "dtype required when constructing a null scalar" + ) else: dtype = value.dtype @@ -235,7 +243,9 @@ def _preprocess_host_value(self, value, dtype): if not valid: value = ( - NaT if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) else NA + NaT + if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) + else NA ) return value, dtype @@ -293,13 +303,18 @@ def __neg__(self): def __repr__(self): # str() fixes a numpy bug with NaT # https://github.com/numpy/numpy/issues/17552 - return f"{self.__class__.__name__}" f"({str(self.value)}, dtype={self.dtype})" + return ( + f"{self.__class__.__name__}" + f"({str(self.value)}, dtype={self.dtype})" + ) def _binop_result_dtype_or_error(self, other, op): if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}: return np.bool_ - out_dtype = get_allowed_combinations_for_operator(self.dtype, other.dtype, op) + out_dtype = get_allowed_combinations_for_operator( + self.dtype, other.dtype, op + ) # datetime handling if out_dtype in {"M", "m"}: @@ -314,7 +329,10 @@ def _binop_result_dtype_or_error(self, other, op): }: return other.dtype else: - if op == "__sub__" and self.dtype.char == other.dtype.char == "M": + if ( + op == "__sub__" + and self.dtype.char == other.dtype.char == "M" + ): res, _ = np.datetime_data(max(self.dtype, other.dtype)) return cudf.dtype("m8" + f"[{res}]") return np.result_type(self.dtype, other.dtype) @@ -353,7 +371,8 @@ def _dispatch_scalar_binop(self, other, op): def _unaop_result_type_or_error(self, op): if op == "__neg__" and self.dtype == "bool": raise TypeError( - "Boolean scalars in cuDF do not support" " negation, use logical not" + "Boolean scalars in cuDF do not support" + " negation, use logical not" ) if op in {"__ceil__", "__floor__"}: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 399b62edee6..275dc664175 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -158,7 +158,9 @@ def _describe_categorical(obj, percentiles): # In case there's a tie, break the tie by sorting the index # and take the top. val_counts = obj.value_counts(ascending=False) - tied_val_counts = val_counts[val_counts == val_counts.iloc[0]].sort_index() + tied_val_counts = val_counts[ + val_counts == val_counts.iloc[0] + ].sort_index() data.update( { "top": tied_val_counts.index[0], @@ -212,19 +214,26 @@ def __setitem__(self, key, value): ) and cudf.utils.utils._isnat(value) and not ( - isinstance(self._frame._column, cudf.core.column.StringColumn) + isinstance( + self._frame._column, cudf.core.column.StringColumn + ) and isinstance(value, str) ) ): raise MixedTypeError( - f"Cannot assign {value=} to non-datetime/non-timedelta " "columns" + f"Cannot assign {value=} to non-datetime/non-timedelta " + "columns" ) elif ( not ( is_float_dtype(self._frame._column.dtype) or ( - isinstance(self._frame._column.dtype, cudf.CategoricalDtype) - and is_float_dtype(self._frame._column.dtype.categories.dtype) + isinstance( + self._frame._column.dtype, cudf.CategoricalDtype + ) + and is_float_dtype( + self._frame._column.dtype.categories.dtype + ) ) ) and isinstance(value, (np.float32, np.float64)) @@ -267,7 +276,9 @@ def __setitem__(self, key, value): value = value.astype(to_dtype) if to_dtype != self._frame._column.dtype: # Do not remove until pandas-3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." 
+ assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"Setting an item of incompatible dtype is deprecated " "and will raise in a future error of pandas. " @@ -374,7 +385,9 @@ def _loc_to_iloc(self, arg): and is_integer_dtype(index_dtype.categories.dtype) ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype(arg.dtype): + if isinstance(arg, cudf.Scalar) and is_integer_dtype( + arg.dtype + ): # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 @@ -479,7 +492,9 @@ def _constructor(self): @property def _constructor_sliced(self): - raise NotImplementedError("_constructor_sliced not supported for Series!") + raise NotImplementedError( + "_constructor_sliced not supported for Series!" + ) @property def _constructor_expanddim(self): @@ -623,7 +638,11 @@ def __init__( # be expensive or mark a buffer as # unspillable. has_cai = ( - type(inspect.getattr_static(data, "__cuda_array_interface__", None)) + type( + inspect.getattr_static( + data, "__cuda_array_interface__", None + ) + ) is property ) column = as_column( @@ -722,7 +741,9 @@ def from_pandas(cls, s: pd.Series, nan_as_null=no_default): dtype: float64 """ if nan_as_null is no_default: - nan_as_null = False if cudf.get_option("mode.pandas_compatible") else None + nan_as_null = ( + False if cudf.get_option("mode.pandas_compatible") else None + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) result = cls(s, nan_as_null=nan_as_null) @@ -777,7 +798,9 @@ def dt(self): elif isinstance(self._column, TimeDeltaColumn): return TimedeltaProperties(self) else: - raise AttributeError("Can only use .dt accessor with datetimelike values") + raise AttributeError( + "Can only use .dt accessor with datetimelike values" + ) @property # type: ignore @_cudf_nvtx_annotate @@ -837,7 +860,9 @@ def serialize(self): @_cudf_nvtx_annotate def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] - obj = super().deserialize(header, frames[header["index_frame_count"] :]) + obj = super().deserialize( + header, frames[header["index_frame_count"] :] + ) idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) @@ -875,7 +900,9 @@ def drop( # Ignore columns for Series if columns is not None: columns = [] - return super().drop(labels, axis, index, columns, level, inplace, errors) + return super().drop( + labels, axis, index, columns, level, inplace, errors + ) def tolist(self): # noqa: D102 raise TypeError( @@ -974,7 +1001,9 @@ def reindex(self, *args, **kwargs): """ if len(args) > 1: - raise TypeError("Only one positional argument ('index') is allowed") + raise TypeError( + "Only one positional argument ('index') is allowed" + ) if args: (index,) = args if "index" in kwargs: @@ -1059,10 +1088,13 @@ def reindex(self, *args, **kwargs): """, ) ) - def reset_index(self, level=None, drop=False, name=no_default, inplace=False): + def reset_index( + self, level=None, drop=False, name=no_default, inplace=False + ): if not drop and inplace: raise TypeError( - "Cannot reset_index inplace on a Series " "to create a DataFrame" + "Cannot reset_index inplace on a Series " + "to create a DataFrame" ) data, index = self._reset_index(level=level, drop=drop) if not drop: @@ -1123,7 +1155,9 @@ def to_frame(self, name=None): @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): - return self._column.memory_usage + 
(self._index.memory_usage() if index else 0) + return self._column.memory_usage + ( + self._index.memory_usage() if index else 0 + ) @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): @@ -1251,7 +1285,9 @@ def map(self, arg, na_action=None) -> "Series": raise NotImplementedError( "default values in dicts are currently not supported." ) - lhs = cudf.DataFrame({"x": self, "orig_order": as_column(range(len(self)))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1259,16 +1295,21 @@ def map(self, arg, na_action=None) -> "Series": "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) - res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") + res = lhs.merge(rhs, on="x", how="left").sort_values( + by="orig_order" + ) result = res["s"] result.name = self.name result.index = self.index elif isinstance(arg, cudf.Series): if not arg.index.is_unique: raise ValueError( - "Reindexing only valid with" " uniquely valued Index objects" + "Reindexing only valid with" + " uniquely valued Index objects" ) - lhs = cudf.DataFrame({"x": self, "orig_order": as_column(range(len(self)))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1276,7 +1317,9 @@ def map(self, arg, na_action=None) -> "Series": "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) - res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") + res = lhs.merge(rhs, on="x", how="left").sort_values( + by="orig_order" + ) result = res["s"] result.name = self.name result.index = self.index @@ -1312,7 +1355,9 @@ def _getitem_preprocessed( elif isinstance(spec, indexing_utils.SliceIndexer): return self._slice(spec.key) elif isinstance(spec, indexing_utils.ScalarIndexer): - return self._gather(spec.key, keep_index=False)._column.element_indexing(0) + return self._gather( + spec.key, keep_index=False + )._column.element_indexing(0) elif isinstance(spec, indexing_utils.EmptyIndexer): return self._empty_like(keep_index=True) assert_never(spec) @@ -1377,8 +1422,12 @@ def __repr__(self): ) else str(cudf.NA) ) - output = repr(preprocess.astype("str").fillna(fill_value).to_pandas()) - elif isinstance(preprocess._column, cudf.core.column.CategoricalColumn): + output = repr( + preprocess.astype("str").fillna(fill_value).to_pandas() + ) + elif isinstance( + preprocess._column, cudf.core.column.CategoricalColumn + ): min_rows = ( height if pd.get_option("display.min_rows") == 0 @@ -1469,7 +1518,9 @@ def _make_operands_and_index_for_binop( and fn in cudf.utils.utils._EQUALITY_OPS and not self.index.equals(other.index) ): - raise ValueError("Can only compare identically-labeled Series objects") + raise ValueError( + "Can only compare identically-labeled Series objects" + ) lhs, other = _align_indices([self, other], allow_non_unique=True) else: lhs = self @@ -1524,7 +1575,9 @@ def _concat(cls, objs, axis=0, index=True): else: with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) - index = cudf.core.index.Index._concat([o.index for o in objs]) + index = cudf.core.index.Index._concat( + [o.index for o in objs] + ) names = {obj.name for obj in objs} if len(names) == 1: @@ -1538,8 +1591,12 @@ def _concat(cls, objs, axis=0, index=True): if ( obj.null_count == len(obj) or len(obj) == 0 - or isinstance(obj._column, cudf.core.column.CategoricalColumn) - or isinstance(objs[0]._column, 
cudf.core.column.CategoricalColumn) + or isinstance( + obj._column, cudf.core.column.CategoricalColumn + ) + or isinstance( + objs[0]._column, cudf.core.column.CategoricalColumn + ) ): continue @@ -1694,7 +1751,9 @@ def dropna(self, axis=0, inplace=False, how=None): dtype: object """ if axis not in (0, "index"): - raise ValueError("Series.dropna supports only one axis to drop values from") + raise ValueError( + "Series.dropna supports only one axis to drop values from" + ) result = super().dropna(axis=axis) @@ -1775,7 +1834,9 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) @_cudf_nvtx_annotate - def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None): + def fillna( + self, value=None, method=None, axis=None, inplace=False, limit=None + ): if isinstance(value, pd.Series): value = Series.from_pandas(value) @@ -2267,7 +2328,8 @@ def argsort( def replace(self, to_replace=None, value=no_default, *args, **kwargs): if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( - "Series.replace cannot use dict-like to_replace and non-None " "value" + "Series.replace cannot use dict-like to_replace and non-None " + "value" ) return super().replace(to_replace, value, *args, **kwargs) @@ -2621,7 +2683,9 @@ def mode(self, dropna=True): @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): if not is_integer(decimals): - raise ValueError(f"decimals must be an int, got {type(decimals).__name__}") + raise ValueError( + f"decimals must be an int, got {type(decimals).__name__}" + ) decimals = int(decimals) return super().round(decimals, how) @@ -2656,7 +2720,9 @@ def cov(self, other, min_periods=None): """ if min_periods is not None: - raise NotImplementedError("min_periods parameter is not implemented yet") + raise NotImplementedError( + "min_periods parameter is not implemented yet" + ) if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -2670,7 +2736,8 @@ def cov(self, other, min_periods=None): return lhs._column.cov(rhs._column) except AttributeError: raise TypeError( - f"cannot perform covariance with types {self.dtype}, " f"{other.dtype}" + f"cannot perform covariance with types {self.dtype}, " + f"{other.dtype}" ) @_cudf_nvtx_annotate @@ -3069,9 +3136,9 @@ def value_counts( res = res[res.index.notna()] else: res = self.groupby(self, dropna=dropna).count(dropna=dropna) - if isinstance(self.dtype, cudf.CategoricalDtype) and len(res) != len( - self.dtype.categories - ): + if isinstance(self.dtype, cudf.CategoricalDtype) and len( + res + ) != len(self.dtype.categories): # For categorical dtypes: When there exists # categories in dtypes and they are missing in the # column, `value_counts` will have to return @@ -3100,7 +3167,9 @@ def value_counts( return res @_cudf_nvtx_annotate - def quantile(self, q=0.5, interpolation="linear", exact=True, quant_index=True): + def quantile( + self, q=0.5, interpolation="linear", exact=True, quant_index=True + ): """ Return values at the given quantile. 
@@ -3159,7 +3228,9 @@ def quantile(self, q=0.5, interpolation="linear", exact=True, quant_index=True): try: np_array_q = cudf.core.column.as_column(q).values_host except TypeError: - raise TypeError(f"q must be a scalar or array-like, got {type(q)}") + raise TypeError( + f"q must be a scalar or array-like, got {type(q)}" + ) result = self._column.quantile( np_array_q, interpolation, exact, return_scalar=return_scalar @@ -3251,7 +3322,9 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series(cudf.core.column.numerical.digitize(self._column, bins, right)) + return Series( + cudf.core.column.numerical.digitize(self._column, bins, right) + ) @_cudf_nvtx_annotate def diff(self, periods=1): @@ -3553,7 +3626,9 @@ def pct_change( ) if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " @@ -4140,7 +4215,9 @@ def quarter(self): 1 4 dtype: int8 """ - res = libcudf.datetime.extract_quarter(self.series._column).astype(np.int8) + res = libcudf.datetime.extract_quarter(self.series._column).astype( + np.int8 + ) return Series._from_data( {None: res}, index=self.series._index, @@ -4336,7 +4413,9 @@ def is_quarter_start(self): dtype: bool """ day = self.series._column.get_dt_field("day") - first_month = self.series._column.get_dt_field("month").isin([1, 4, 7, 10]) + first_month = self.series._column.get_dt_field("month").isin( + [1, 4, 7, 10] + ) result = ((day == cudf.Scalar(1)) & first_month).fillna(False) return Series._from_data( @@ -4385,7 +4464,9 @@ def is_quarter_end(self): day = self.series._column.get_dt_field("day") last_day = libcudf.datetime.last_day_of_month(self.series._column) last_day = last_day.get_dt_field("day") - last_month = self.series._column.get_dt_field("month").isin([3, 6, 9, 12]) + last_month = self.series._column.get_dt_field("month").isin( + [3, 6, 9, 12] + ) result = ((day == last_day) & last_month).fillna(False) return Series._from_data( @@ -4420,7 +4501,9 @@ def is_year_start(self): 2 True dtype: bool """ - outcol = self.series._column.get_dt_field("day_of_year") == cudf.Scalar(1) + outcol = self.series._column.get_dt_field( + "day_of_year" + ) == cudf.Scalar(1) return Series._from_data( {None: outcol.fillna(False)}, index=self.series._index, @@ -4469,7 +4552,9 @@ def is_year_end(self): @_cudf_nvtx_annotate def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) - return Series(data=out_column, index=self.series._index, name=self.series.name) + return Series( + data=out_column, index=self.series._index, name=self.series.name + ) @_cudf_nvtx_annotate def ceil(self, freq): @@ -4640,7 +4725,9 @@ def strftime(self, date_format, *args, **kwargs): """ if not isinstance(date_format, str): - raise TypeError(f"'date_format' must be str, not {type(date_format)}") + raise TypeError( + f"'date_format' must be str, not {type(date_format)}" + ) # TODO: Remove following validations # once https://github.com/rapidsai/cudf/issues/5991 @@ -4658,8 +4745,12 @@ def strftime(self, date_format, *args, **kwargs): f"https://github.com/rapidsai/cudf/issues/5991 " f"for tracking purposes." 
) - str_col = self.series._column.as_string_column(dtype="str", format=date_format) - return Series(data=str_col, index=self.series._index, name=self.series.name) + str_col = self.series._column.as_string_column( + dtype="str", format=date_format + ) + return Series( + data=str_col, index=self.series._index, name=self.series.name + ) @copy_docstring(DatetimeIndex.tz_localize) def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): @@ -4668,7 +4759,9 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): if tz is None: result_col = delocalize(self.series._column) else: - result_col = localize(self.series._column, tz, ambiguous, nonexistent) + result_col = localize( + self.series._column, tz, ambiguous, nonexistent + ) return Series._from_data( data={self.series.name: result_col}, index=self.series._index, @@ -4935,7 +5028,9 @@ def components(self): @_cudf_nvtx_annotate def _get_td_field(self, field): out_column = getattr(self.series._column, field) - return Series(data=out_column, index=self.series._index, name=self.series.name) + return Series( + data=out_column, index=self.series._index, name=self.series.name + ) @_cudf_nvtx_annotate @@ -4991,7 +5086,9 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): # align all Series to the combined index result = [ - sr._align_to_index(combined_index, how=how, allow_non_unique=allow_non_unique) + sr._align_to_index( + combined_index, how=how, allow_non_unique=allow_non_unique + ) for sr in series_list ] diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 5d4a3d49855..19dde2e51b9 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -316,9 +316,9 @@ def _make_operands_for_binop( # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. - if isinstance(other, SingleColumnFrame) and not cudf.utils.utils._is_same_name( - self.name, other.name - ): + if isinstance( + other, SingleColumnFrame + ) and not cudf.utils.utils._is_same_name(self.name, other.name): result_name = None else: result_name = self.name @@ -326,9 +326,9 @@ def _make_operands_for_binop( if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): - if not hasattr(other, "__cuda_array_interface__") and not isinstance( - other, cudf.RangeIndex - ): + if not hasattr( + other, "__cuda_array_interface__" + ) and not isinstance(other, cudf.RangeIndex): return NotImplemented # Non-scalar right operands are valid iff they convert to columns. 
@@ -381,7 +381,9 @@ def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: return self._column.take(arg) if is_bool_dtype(arg.dtype): if (bn := len(arg)) != (n := len(self)): - raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") + raise IndexError( + f"Boolean mask has wrong length: {bn} not {n}" + ) return self._column.apply_boolean_mask(arg) raise NotImplementedError(f"Unknown indexer {type(arg)}") @@ -393,10 +395,14 @@ def where(self, cond, other=None, inplace=False): ) if isinstance(other, cudf.DataFrame): - raise NotImplementedError("cannot align with a higher dimensional Frame") + raise NotImplementedError( + "cannot align with a higher dimensional Frame" + ) cond = as_column(cond) if len(cond) != len(self): - raise ValueError("""Array conditional must be same shape as self""") + raise ValueError( + """Array conditional must be same shape as self""" + ) if not cudf.api.types.is_scalar(other): other = cudf.core.column.as_column(other) diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 475e705b4a4..24c49e3662a 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -189,7 +189,8 @@ def __call__( if padding != "max_length": error_msg = ( - "Only padding to the provided max_length" "is currently supported" + "Only padding to the provided max_length" + "is currently supported" ) raise NotImplementedError(error_msg) @@ -199,7 +200,8 @@ def __call__( if return_tensors not in {"cp", "pt", "tf"}: error_msg = ( - "Only cupy(cp), pytorch(pt) and tensorflow(tf) " "tensors are supported" + "Only cupy(cp), pytorch(pt) and tensorflow(tf) " + "tensors are supported" ) raise NotImplementedError(error_msg) @@ -217,7 +219,9 @@ def __call__( tokenizer_output = { "input_ids": cp.asarray(input_ids).reshape(-1, max_length), - "attention_mask": cp.asarray(attention_mask).reshape(-1, max_length), + "attention_mask": cp.asarray(attention_mask).reshape( + -1, max_length + ), "metadata": cp.asarray(metadata).reshape(-1, 3), } @@ -244,7 +248,9 @@ def _bert_add_special_tokens(token_o): seq_end_col = cp.clip(seq_end_col + 1, a_min=None, a_max=max_length - 1) _bert_add_special_tokens_input_ids(token_o["input_ids"], seq_end_col) - _bert_add_special_tokens_attention_mask(token_o["attention_mask"], seq_end_col) + _bert_add_special_tokens_attention_mask( + token_o["attention_mask"], seq_end_col + ) _bert_add_special_tokens_metadata(token_o["metadata"], max_length) return token_o @@ -260,7 +266,9 @@ def _bert_add_special_tokens_input_ids(input_ids, seq_end_col): input_ids[:, 0] = 101 # Mark end of sequence [SEP] - input_ids[cp.arange(0, input_ids.shape[0], dtype=cp.uint32), seq_end_col] = 102 + input_ids[ + cp.arange(0, input_ids.shape[0], dtype=cp.uint32), seq_end_col + ] = 102 def _bert_add_special_tokens_attention_mask(attention_mask, seq_end_col): @@ -284,4 +292,6 @@ def _bert_add_special_tokens_metadata(metadata, max_length): # metadata seq starts from plus 1 metadata[:, 1] = metadata[:, 1] + 1 # clip done to take overflow into account - metadata[:, 2] = cp.clip(metadata[:, 2] + 1, a_min=None, a_max=max_length - 2) + metadata[:, 2] = cp.clip( + metadata[:, 2] + 1, a_min=None, a_max=max_length - 2 + ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index eee8370c019..65f97c99934 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -164,9 +164,9 @@ def to_datetime( if errors == 
"ignore": warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. " - "Use to_datetime without passing `errors` and catch exceptions " - "explicitly instead", + "errors='ignore' is deprecated and will raise in a " + "future version. Use to_datetime without passing `errors` " + "and catch exceptions explicitly instead", FutureWarning, ) @@ -231,7 +231,9 @@ def to_datetime( + arg[unit_rev["day"]].astype("str").str.zfill(2) ) format = "%Y-%m-%d" - col = new_series._column.as_datetime_column("datetime64[s]", format=format) + col = new_series._column.as_datetime_column( + "datetime64[s]", format=format + ) for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) @@ -265,7 +267,9 @@ def to_datetime( factor = cudf.Scalar( column.datetime._unit_to_nanoseconds_conversion[u] / ( - column.datetime._unit_to_nanoseconds_conversion["s"] + column.datetime._unit_to_nanoseconds_conversion[ + "s" + ] if np.datetime_data(col.dtype)[0] == "s" else 1 ) @@ -276,7 +280,9 @@ def to_datetime( else: times_column = times_column + (current_col * factor) if times_column is not None: - col = (col.astype(dtype="int64") + times_column).astype(dtype=col.dtype) + col = (col.astype(dtype="int64") + times_column).astype( + dtype=col.dtype + ) col = _process_col( col=col, unit=unit, @@ -331,7 +337,9 @@ def _process_col( if col.dtype.kind == "f": if unit not in (None, "ns"): - factor = cudf.Scalar(column.datetime._unit_to_nanoseconds_conversion[unit]) + factor = cudf.Scalar( + column.datetime._unit_to_nanoseconds_conversion[unit] + ) col = col * factor if format is not None: @@ -345,7 +353,9 @@ def _process_col( col.astype("int") .astype("str") .as_datetime_column( - dtype="datetime64[us]" if "%f" in format else "datetime64[s]", + dtype="datetime64[us]" + if "%f" in format + else "datetime64[s]", format=format, ) ) @@ -526,7 +536,9 @@ class DateOffset: def __init__(self, n=1, normalize=False, **kwds): if normalize: - raise NotImplementedError("normalize not yet supported for DateOffset") + raise NotImplementedError( + "normalize not yet supported for DateOffset" + ) all_possible_units = { "years", @@ -607,7 +619,9 @@ def kwds(self): def _combine_months_and_years(self, **kwargs): # TODO: if months is zero, don't do a binop - kwargs["months"] = kwargs.pop("years", 0) * 12 + kwargs.pop("months", 0) + kwargs["months"] = kwargs.pop("years", 0) * 12 + kwargs.pop( + "months", 0 + ) return kwargs def _combine_kwargs_to_seconds(self, **kwargs): @@ -632,7 +646,9 @@ def _combine_kwargs_to_seconds(self, **kwargs): kwargs["seconds"] = seconds return kwargs - def _datetime_binop(self, datetime_col, op, reflect=False) -> column.DatetimeColumn: + def _datetime_binop( + self, datetime_col, op, reflect=False + ) -> column.DatetimeColumn: if reflect and op == "__sub__": raise TypeError( f"Can not subtract a {type(datetime_col).__name__}" @@ -962,7 +978,8 @@ def date_range( # are dropped in conversion during the binops warnings.simplefilter("ignore", UserWarning) end_estim = ( - pd.Timestamp(start.value) + periods * offset._maybe_as_fast_pandas_offset() + pd.Timestamp(start.value) + + periods * offset._maybe_as_fast_pandas_offset() ).to_datetime64() if "months" in offset.kwds or "years" in offset.kwds: @@ -1049,10 +1066,13 @@ def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: def _to_iso_calendar(arg): formats = ["%G", "%V", "%u"] if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): - raise AttributeError("Can only use .isocalendar accessor with series or index") + raise 
AttributeError( + "Can only use .isocalendar accessor with series or index" + ) if isinstance(arg, cudf.Index): iso_params = [ - arg._column.as_string_column(arg._values.dtype, fmt) for fmt in formats + arg._column.as_string_column(arg._values.dtype, fmt) + for fmt in formats ] index = arg._column elif isinstance(arg.series, cudf.Series): diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 19ae632d7f7..68b23f1e059 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -97,16 +97,18 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("invalid error value specified") elif errors == "ignore": warnings.warn( - "errors='ignore' is deprecated and will raise in a future version. " - "Use to_numeric without passing `errors` and catch exceptions " - "explicitly instead", + "errors='ignore' is deprecated and will raise in " + "a future version. Use to_numeric without passing `errors` " + "and catch exceptions explicitly instead", FutureWarning, ) if downcast not in {None, "integer", "signed", "unsigned", "float"}: raise ValueError("invalid downcasting method provided") - if not can_convert_to_column(arg) or (hasattr(arg, "ndim") and arg.ndim > 1): + if not can_convert_to_column(arg) or ( + hasattr(arg, "ndim") and arg.ndim > 1 + ): raise ValueError("arg must be column convertible") col = as_column(arg) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index cb39dfdb196..fe0637cfaef 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
from functools import partial @@ -29,7 +29,9 @@ def group_reduction_impl_basic(context, builder, sig, args, function): retty = sig.return_type # a variable logically corresponding to the calling `Group` - grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) # what specific (numba) GroupType grp_type = sig.args[0] @@ -53,8 +55,12 @@ def group_corr(context, builder, sig, args): """ Instruction boilerplate used for calling a groupby correlation """ - lhs_grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) - rhs_grp = cgutils.create_struct_proxy(sig.args[1])(context, builder, value=args[1]) + lhs_grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + rhs_grp = cgutils.create_struct_proxy(sig.args[1])( + context, builder, value=args[1] + ) device_func = call_cuda_functions["corr"][ ( @@ -68,8 +74,12 @@ def group_corr(context, builder, sig, args): device_func, nb_signature( types.float64, - types.CPointer(sig.args[0].group_scalar_type), # this group calls corr - types.CPointer(sig.args[1].group_scalar_type), # this group is passed + types.CPointer( + sig.args[0].group_scalar_type + ), # this group calls corr + types.CPointer( + sig.args[1].group_scalar_type + ), # this group is passed group_size_type, ), ( @@ -110,7 +120,9 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): """ retty = sig.return_type - grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) grp_type = sig.args[0] if grp_type.index_type != index_default_type: @@ -142,12 +154,18 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): cuda_Group_std = partial(group_reduction_impl_basic, function="std") cuda_Group_var = partial(group_reduction_impl_basic, function="var") -cuda_Group_idxmax = partial(group_reduction_impl_idx_max_or_min, function="idxmax") -cuda_Group_idxmin = partial(group_reduction_impl_idx_max_or_min, function="idxmin") +cuda_Group_idxmax = partial( + group_reduction_impl_idx_max_or_min, function="idxmax" +) +cuda_Group_idxmin = partial( + group_reduction_impl_idx_max_or_min, function="idxmin" +) def cuda_Group_size(context, builder, sig, args): - grp = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) return grp.size @@ -163,6 +181,10 @@ def cuda_Group_size(context, builder, sig, args): cuda_lower("GroupType.mean", GroupType(ty))(cuda_Group_mean) cuda_lower("GroupType.std", GroupType(ty))(cuda_Group_std) cuda_lower("GroupType.var", GroupType(ty))(cuda_Group_var) - cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))(cuda_Group_idxmax) - cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))(cuda_Group_idxmin) + cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))( + cuda_Group_idxmax + ) + cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))( + cuda_Group_idxmin + ) cuda_lower("GroupType.corr", GroupType(ty), GroupType(ty))(group_corr) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index e288ececb88..72088493074 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2023, NVIDIA CORPORATION. from typing import Any, Dict import numba @@ -30,7 +30,9 @@ numpy_support.as_dtype(dt) for dt in SUPPORTED_GROUPBY_NUMBA_TYPES ] -_UDF_DOC_URL = "https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs/" +_UDF_DOC_URL = ( + "https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs/" +) class Group: @@ -52,8 +54,9 @@ class GroupType(numba.types.Type): """ def __init__(self, group_scalar_type, index_type=index_default_type): - if group_scalar_type not in SUPPORTED_GROUPBY_NUMBA_TYPES and not isinstance( - group_scalar_type, types.Poison + if ( + group_scalar_type not in SUPPORTED_GROUPBY_NUMBA_TYPES + and not isinstance(group_scalar_type, types.Poison) ): # A frame containing an column with an unsupported dtype # is calling groupby apply. Construct a GroupType with @@ -65,7 +68,9 @@ def __init__(self, group_scalar_type, index_type=index_default_type): self.group_data_type = types.CPointer(group_scalar_type) self.group_size_type = group_size_type self.group_index_type = types.CPointer(index_type) - super().__init__(name=f"Group({self.group_scalar_type}, {self.index_type})") + super().__init__( + name=f"Group({self.group_scalar_type}, {self.index_type})" + ) class GroupByJITDataFrame(Row): @@ -196,7 +201,9 @@ def generic(self, args, kws): ) if funcs := call_cuda_functions.get(self.key.__name__): for sig in funcs.keys(): - if all(arg.group_scalar_type == ty for arg, ty in zip(args, sig)): + if all( + arg.group_scalar_type == ty for arg, ty in zip(args, sig) + ): return nb_signature(sig[0], *args) raise UDFError(self.make_error_string(args)) @@ -232,7 +239,8 @@ def generic(self, args, kws): for sig in funcs.keys(): retty, selfty, *argtys = sig if self.this.group_scalar_type == selfty and all( - arg.group_scalar_type == ty for arg, ty in zip(args, argtys) + arg.group_scalar_type == ty + for arg, ty in zip(args, argtys) ): return nb_signature(retty, *args, recvr=self.this) raise UDFError(self.make_error_string(args)) @@ -299,7 +307,9 @@ class GroupCorr(GroupBinaryAttrBase): class DataFrameAttributeTemplate(AttributeTemplate): def resolve(self, value, attr): - raise UDFError(f"JIT GroupBy.apply() does not support DataFrame.{attr}(). ") + raise UDFError( + f"JIT GroupBy.apply() does not support DataFrame.{attr}(). 
" + ) @cuda_registry.register_attr @@ -319,8 +329,12 @@ class GroupAttr(AttributeTemplate): resolve_var = _make_unary_attr("var") resolve_std = _make_unary_attr("std") - resolve_size = _create_reduction_attr("GroupType.size", retty=group_size_type) - resolve_count = _create_reduction_attr("GroupType.count", retty=types.int64) + resolve_size = _create_reduction_attr( + "GroupType.size", retty=group_size_type + ) + resolve_count = _create_reduction_attr( + "GroupType.count", retty=types.int64 + ) def resolve_idxmax(self, mod): return types.BoundFunction( diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 183ea9c6c73..06d9296ca0f 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -94,7 +94,9 @@ def _groupby_apply_kernel_string_from_template(frame, args): # Generate the initializers for each device function argument initializers = [] for i, colname in enumerate(frame.keys()): - initializers.append(group_initializer_template.format(idx=i, name=colname)) + initializers.append( + group_initializer_template.format(idx=i, name=colname) + ) return groupby_apply_kernel_template.format( input_columns=input_columns, @@ -105,7 +107,9 @@ def _groupby_apply_kernel_string_from_template(frame, args): def _get_groupby_apply_kernel(frame, func, args): np_field_types = np.dtype(list(_all_dtypes_from_frame(frame).items())) - dataframe_group_type = _get_frame_groupby_type(np_field_types, frame.index.dtype) + dataframe_group_type = _get_frame_groupby_type( + np_field_types, frame.index.dtype + ) return_type = _get_udf_return_type(dataframe_group_type, func, args) @@ -215,7 +219,9 @@ def _can_be_jitted(frame, func, args): ).items() ) ) - dataframe_group_type = _get_frame_groupby_type(np_field_types, frame.index.dtype) + dataframe_group_type = _get_frame_groupby_type( + np_field_types, frame.index.dtype + ) try: _get_udf_return_type(dataframe_group_type, func, args) return True diff --git a/python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py index 18e2a33f2fc..ae09294e3f9 100644 --- a/python/cudf/cudf/core/udf/masked_lowering.py +++ b/python/cudf/cudf/core/udf/masked_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import operator @@ -55,11 +55,17 @@ def masked_scalar_op_impl(context, builder, sig, args): # Let there be two actual LLVM structs backing the two inputs # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html - m1 = cgutils.create_struct_proxy(masked_type_1)(context, builder, value=args[0]) - m2 = cgutils.create_struct_proxy(masked_type_2)(context, builder, value=args[1]) + m1 = cgutils.create_struct_proxy(masked_type_1)( + context, builder, value=args[0] + ) + m2 = cgutils.create_struct_proxy(masked_type_2)( + context, builder, value=args[1] + ) # we will return an output struct - result = cgutils.create_struct_proxy(masked_return_type)(context, builder) + result = cgutils.create_struct_proxy(masked_return_type)( + context, builder + ) # compute output validity valid = builder.and_(m1.valid, m2.valid) result.valid = valid @@ -97,10 +103,14 @@ def masked_scalar_unary_op_impl(context, builder, sig, args): # MaskedType(...) 
masked_return_type = sig.return_type - m1 = cgutils.create_struct_proxy(masked_type_1)(context, builder, value=args[0]) + m1 = cgutils.create_struct_proxy(masked_type_1)( + context, builder, value=args[0] + ) # we will return an output struct - result = cgutils.create_struct_proxy(masked_return_type)(context, builder) + result = cgutils.create_struct_proxy(masked_return_type)( + context, builder + ) # compute output validity result.valid = m1.valid @@ -249,7 +259,9 @@ def masked_scalar_is_null_impl(context, builder, sig, args): na, masked_type = sig.args value = args[1] - indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=value) + indata = cgutils.create_struct_proxy(masked_type)( + context, builder, value=value + ) result = cgutils.alloca_once(builder, ir.IntType(1)) with builder.if_else(indata.valid) as (then, otherwise): with then: @@ -283,7 +295,9 @@ def pack_return_scalar_impl(context, builder, sig, args): @cuda_lower(operator.truth, MaskedType) @cuda_lower(bool, MaskedType) def masked_scalar_bool_impl(context, builder, sig, args): - indata = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) + indata = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) result = cgutils.alloca_once(builder, ir.IntType(1)) with builder.if_else(indata.valid) as (then, otherwise): with then: @@ -304,7 +318,9 @@ def masked_scalar_bool_impl(context, builder, sig, args): @cuda_lower(float, MaskedType) @cuda_lower(int, MaskedType) def masked_scalar_cast_impl(context, builder, sig, args): - input = cgutils.create_struct_proxy(sig.args[0])(context, builder, value=args[0]) + input = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) result = cgutils.create_struct_proxy(sig.return_type)(context, builder) casted = context.cast( @@ -351,7 +367,9 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # We will operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val) - casted = context.cast(builder, operand.value, fromty.value_type, toty.value_type) + casted = context.cast( + builder, operand.value, fromty.value_type, toty.value_type + ) ext = cgutils.create_struct_proxy(toty)(context, builder) ext.value = casted ext.valid = operand.valid diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index c84549adfc9..4c90c5bbba0 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import operator @@ -47,7 +47,9 @@ TIMEDELTA_TYPES, ) -SUPPORTED_NUMPY_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES +SUPPORTED_NUMPY_TYPES = ( + NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES +) supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) _units = ["ns", "ms", "us", "s"] @@ -148,7 +150,9 @@ def f(x): # two MaskedType unify to a new MaskedType whose value_type # is the result of unifying `self` and `other` `value_type` elif isinstance(other, MaskedType): - return MaskedType(context.unify_pairs(self.value_type, other.value_type)) + return MaskedType( + context.unify_pairs(self.value_type, other.value_type) + ) # if we have MaskedType and something that results in a # scalar, unify between the MaskedType's value_type @@ -184,7 +188,8 @@ def typeof_masked(val, c): class MaskedConstructor(ConcreteTemplate): key = api.Masked cases = [ - nb_signature(MaskedType(t), t, types.boolean) for t in _supported_masked_types + nb_signature(MaskedType(t), t, types.boolean) + for t in _supported_masked_types ] @@ -200,7 +205,9 @@ def resolve_Masked(self, mod): # Registration of the global is also needed for Numba to type api.Masked cuda_decl_registry.register_global(api, types.Module(api)) # For typing bare Masked (as in `from .api import Masked` -cuda_decl_registry.register_global(api.Masked, types.Function(MaskedConstructor)) +cuda_decl_registry.register_global( + api.Masked, types.Function(MaskedConstructor) +) # Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`. @@ -606,10 +613,14 @@ class MaskedStringViewAttrs(AttributeTemplate): key = MaskedType(string_view) def resolve_replace(self, mod): - return types.BoundFunction(MaskedStringViewReplace, MaskedType(string_view)) + return types.BoundFunction( + MaskedStringViewReplace, MaskedType(string_view) + ) def resolve_count(self, mod): - return types.BoundFunction(MaskedStringViewCount, MaskedType(string_view)) + return types.BoundFunction( + MaskedStringViewCount, MaskedType(string_view) + ) def resolve_value(self, mod): return string_view diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py index 843521e1956..e040836f97d 100644 --- a/python/cudf/cudf/core/udf/row_function.py +++ b/python/cudf/cudf/core/udf/row_function.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import math import numpy as np @@ -122,7 +122,9 @@ def f(row, c, k): else unmasked_input_initializer_template ) initializers.append(template.format(idx=idx)) - row_initializers.append(row_initializer_template.format(idx=idx, name=colname)) + row_initializers.append( + row_initializer_template.format(idx=idx, name=colname) + ) return row_kernel_template.format( input_columns=input_columns, @@ -143,7 +145,9 @@ def _get_row_kernel(frame, func, args): sig = _construct_signature(frame, scalar_return_type, args) # this row type is used within the kernel to pack up the column and # mask data into the dict like data structure the user udf expects - np_field_types = np.dtype(list(_supported_dtypes_from_frame(frame).items())) + np_field_types = np.dtype( + list(_supported_dtypes_from_frame(frame).items()) + ) row_type = _get_frame_row_type(np_field_types) # Dict of 'local' variables into which `_kernel` is defined diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index fd3fe3d4370..f2d58e97910 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -125,7 +125,9 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): sv = cgutils.create_struct_proxy(string_view)(context, builder) # set the empty strview data pointer to point to the literal value - sv.data = context.insert_string_const_addrspace(builder, fromty.literal_value) + sv.data = context.insert_string_const_addrspace( + builder, fromty.literal_value + ) sv.length = context.get_constant(size_type, len(fromty.literal_value)) sv.bytes = context.get_constant( size_type, len(fromty.literal_value.encode("UTF-8")) @@ -255,7 +257,9 @@ def replace_impl(context, builder, sig, args): _ = context.compile_internal( builder, call_string_view_replace, - types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + types.void( + _UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR + ), (udf_str_ptr, src_ptr, to_replace_ptr, replacement_ptr), ) @@ -325,9 +329,9 @@ def binary_func_impl(context, builder, sig, args): f"UDFString.{binary_func}", string_view, string_view )(binary_func_impl) else: - binary_func_impl = cuda_lower(binary_func, string_view, string_view)( - binary_func_impl - ) + binary_func_impl = cuda_lower( + binary_func, string_view, string_view + )(binary_func_impl) return binary_func_impl @@ -426,7 +430,9 @@ def id_func_impl(context, builder, sig, args): # Lookup table required for conversion functions # must be resolved at runtime after context initialization, # therefore cannot be a global variable - tbl_ptr = context.get_constant(types.uintp, get_character_flags_table_ptr()) + tbl_ptr = context.get_constant( + types.uintp, get_character_flags_table_ptr() + ) result = context.compile_internal( builder, cuda_func, @@ -467,7 +473,9 @@ def id_func_impl(context, builder, sig, args): special_tbl_ptr = context.get_constant( types.uintp, get_special_case_mapping_table_ptr() ) - udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) + udf_str_ptr = builder.alloca( + default_manager[udf_string].get_value_type() + ) _ = context.compile_internal( builder, @@ -561,7 +569,9 @@ def masked_len_impl(context, builder, sig, args): masked_sv = cgutils.create_struct_proxy(masked_sv_ty)( context, builder, value=args[0] ) - result = len_impl(context, builder, size_type(string_view), (masked_sv.value,)) + result = len_impl( + context, builder, size_type(string_view), (masked_sv.value,) + ) 
ret.value = result ret.valid = masked_sv.valid @@ -690,11 +700,15 @@ def upper_or_lower_impl(context, builder, sig, args): startswith_impl, types.boolean, ) -create_masked_binary_string_func("MaskedType.endswith", endswith_impl, types.boolean) +create_masked_binary_string_func( + "MaskedType.endswith", endswith_impl, types.boolean +) create_masked_binary_string_func("MaskedType.find", find_impl, size_type) create_masked_binary_string_func("MaskedType.rfind", rfind_impl, size_type) create_masked_binary_string_func("MaskedType.count", count_impl, size_type) -create_masked_binary_string_func(operator.contains, contains_impl, types.boolean) +create_masked_binary_string_func( + operator.contains, contains_impl, types.boolean +) create_masked_unary_identifier_func("MaskedType.isalnum", isalnum_impl) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 6268ff1b2cd..43604ab21a7 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -190,7 +190,9 @@ class StringViewReplace(AbstractTemplate): key = "StringView.replace" def generic(self, args, kws): - return nb_signature(udf_string, string_view, string_view, recvr=self.this) + return nb_signature( + udf_string, string_view, string_view, recvr=self.this + ) class StringViewAttrs(AttributeTemplate): @@ -235,7 +237,9 @@ def resolve_replace(self, mod): for func in int_binary_funcs: - setattr(StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type)) + setattr( + StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) + ) for func in id_unary_funcs: setattr( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index ba0d997e3ff..bc1f4f2557e 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -47,7 +47,11 @@ JIT_SUPPORTED_TYPES = ( - NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES + NUMERIC_TYPES + | BOOL_TYPES + | DATETIME_TYPES + | TIMEDELTA_TYPES + | STRING_TYPES ) libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) MASK_BITSIZE = np.dtype("int32").itemsize * 8 @@ -59,7 +63,9 @@ @functools.cache def _ptx_file(): return _get_ptx_file( - os.path.join(os.path.dirname(strings_udf.__file__), "..", "core", "udf"), + os.path.join( + os.path.dirname(strings_udf.__file__), "..", "core", "udf" + ), "shim_", ) @@ -116,7 +122,9 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype if str(col.dtype) in supported_types else np.dtype("O") + colname: col.dtype + if str(col.dtype) in supported_types + else np.dtype("O") for colname, col in frame._data.items() } @@ -217,7 +225,9 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): """ scalar_argtypes = tuple(typeof(arg) for arg in args) return ( - *cudautils.make_cache_key(func, tuple(_all_dtypes_from_frame(frame).values())), + *cudautils.make_cache_key( + func, tuple(_all_dtypes_from_frame(frame).values()) + ), *(col.mask is None for col in frame._data.values()), *frame._data.keys(), scalar_argtypes, @@ -226,7 +236,9 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): @_cudf_nvtx_annotate -def _compile_or_get(frame, func, args, kernel_getter=None, suffix="__APPLY_UDF"): +def _compile_or_get( + frame, func, args, kernel_getter=None, suffix="__APPLY_UDF" +): """ Return a compiled kernel in terms of MaskedTypes 
that launches a kernel equivalent of `f` for the dtypes of `df`. The kernel uses @@ -277,9 +289,9 @@ def _get_kernel(kernel_string, globals_, sig, func): globals_["f_"] = f_ exec(kernel_string, globals_) _kernel = globals_["_kernel"] - kernel = cuda.jit(sig, link=[_ptx_file()], extensions=[str_view_arg_handler])( - _kernel - ) + kernel = cuda.jit( + sig, link=[_ptx_file()], extensions=[str_view_arg_handler] + )(_kernel) return kernel diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index b9b3b4f8e76..2037b1682db 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -226,8 +226,12 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype("int32") - following_window = (end - idx - cudf.Scalar(1, "int32")).astype("int32") + preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( + "int32" + ) + following_window = (end - idx - cudf.Scalar(1, "int32")).astype( + "int32" + ) window = None else: preceding_window = as_column(self.window) @@ -259,7 +263,11 @@ def _apply_agg_dataframe(self, df, agg_name): def _apply_agg(self, agg_name): if isinstance(self.obj, cudf.Series): return cudf.Series._from_data( - {self.obj.name: self._apply_agg_column(self.obj._column, agg_name)}, + { + self.obj.name: self._apply_agg_column( + self.obj._column, agg_name + ) + }, index=self.obj.index, ) else: @@ -431,14 +439,18 @@ def _normalize(self): if self.min_periods is None: min_periods = window else: - if isinstance(window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer)): + if isinstance( + window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer) + ): # window is a device_array of window sizes or BaseIndexer self.window = window self.min_periods = min_periods return if not isinstance(self.obj.index, cudf.core.index.DatetimeIndex): - raise ValueError("window must be an integer for non datetime index") + raise ValueError( + "window must be an integer for non datetime index" + ) self._time_window = True @@ -494,10 +506,14 @@ def __init__(self, groupby, window, min_periods=None, center=False): # of `groupby.grouping.keys` and `groupby.obj`. # As an optimization, avoid gathering those twice. 
self._group_keys = groupby.grouping.keys.take(sort_order) - obj = groupby.obj.drop(columns=groupby.grouping._named_columns).take(sort_order) + obj = groupby.obj.drop(columns=groupby.grouping._named_columns).take( + sort_order + ) gb_size = groupby.size().sort_index() - self._group_starts = gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) + self._group_starts = ( + gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) + ) super().__init__(obj, window, min_periods=min_periods, center=center) diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index e311f214d62..7b183d5f1a3 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -55,7 +55,9 @@ def timeseries( if dtypes is None: dtypes = {"name": "category", "id": int, "x": float, "y": float} - index = pd.DatetimeIndex(pd.date_range(start, end, freq=freq, name="timestamp")) + index = pd.DatetimeIndex( + pd.date_range(start, end, freq=freq, name="timestamp") + ) state = np.random.RandomState(seed) columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()} df = pd.DataFrame(columns, index=index, columns=sorted(columns)) @@ -156,7 +158,9 @@ def make_string(n, rstate): def make_categorical(n, rstate): - return pd.Categorical.from_codes(rstate.randint(0, len(names), size=n), names) + return pd.Categorical.from_codes( + rstate.randint(0, len(names), size=n), names + ) def make_bool(n, rstate): diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 54fbd833b99..728b34045bf 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import cudf from cudf import _lib as libcudf @@ -33,5 +33,7 @@ def read_avro( ValueError("URL content-encoding decompression is not supported") return cudf.DataFrame._from_data( - *libcudf.avro.read_avro(filepath_or_buffer, columns, skiprows, num_rows) + *libcudf.avro.read_avro( + filepath_or_buffer, columns, skiprows, num_rows + ) ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3591072e4e8..3eeeac405b3 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -65,7 +65,8 @@ def read_csv( if use_python_file_object and bytes_per_thread is not None: raise ValueError( - "bytes_per_thread is only supported when " "`use_python_file_object=False`" + "bytes_per_thread is only supported when " + "`use_python_file_object=False`" ) if bytes_per_thread is None: @@ -198,16 +199,20 @@ def to_csv( try: df = df[columns] except KeyError: - raise NameError("Dataframe doesn't have the labels provided in columns") + raise NameError( + "Dataframe doesn't have the labels provided in columns" + ) for col in df._data.columns: if isinstance(col, cudf.core.column.ListColumn): raise NotImplementedError( - "Writing to csv format is not yet supported with " "list columns." + "Writing to csv format is not yet supported with " + "list columns." ) elif isinstance(col, cudf.core.column.StructColumn): raise NotImplementedError( - "Writing to csv format is not yet supported with " "Struct columns." + "Writing to csv format is not yet supported with " + "Struct columns." 
) # TODO: Need to typecast categorical columns to the underlying @@ -215,7 +220,8 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) for col in df._data.columns + isinstance(col, cudf.core.column.CategoricalColumn) + for col in df._data.columns ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) for col_name, col in df._data.items(): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 37b77eb14ad..d3d99aab0cd 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -74,7 +74,8 @@ def to_dlpack(cudf_obj): gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( - f"Input of type {type(cudf_obj)} cannot be converted " "to DLPack tensor" + f"Input of type {type(cudf_obj)} cannot be converted " + "to DLPack tensor" ) if any( @@ -83,7 +84,9 @@ def to_dlpack(cudf_obj): ): raise TypeError("non-numeric data not yet supported") - dtype = cudf.utils.dtypes.find_common_type([col.dtype for col in gdf._data.columns]) + dtype = cudf.utils.dtypes.find_common_type( + [col.dtype for col in gdf._data.columns] + ) gdf = gdf.astype(dtype) return libdlpack.to_dlpack([*gdf._columns]) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 60530ad56dc..b2f3fd09146 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -60,7 +60,9 @@ def read_json( if engine == "auto": engine = "cudf" if lines else "pandas" if engine != "cudf" and keep_quotes: - raise ValueError("keep_quotes='True' is supported only with engine='cudf'") + raise ValueError( + "keep_quotes='True' is supported only with engine='cudf'" + ) if engine == "cudf_legacy" or engine == "cudf": if dtype is None: @@ -130,7 +132,8 @@ def read_json( storage_options=storage_options, ): raise NotImplementedError( - "`read_json` does not yet support reading " "multiple files via pandas" + "`read_json` does not yet support reading " + "multiple files via pandas" ) path_or_buf, compression = ioutils.get_reader_filepath_or_buffer( @@ -223,9 +226,13 @@ def to_json( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libjson.write_json(cudf_val, path_or_buf=file_obj, *args, **kwargs) + libjson.write_json( + cudf_val, path_or_buf=file_obj, *args, **kwargs + ) else: - libjson.write_json(cudf_val, path_or_buf=path_or_buf, *args, **kwargs) + libjson.write_json( + cudf_val, path_or_buf=path_or_buf, *args, **kwargs + ) if return_as_string: path_or_buf.seek(0) @@ -249,5 +256,6 @@ def to_json( ) else: raise ValueError( - f"`engine` only support {{'auto', 'cudf', 'pandas'}}, " f"got: {engine}" + f"`engine` only support {{'auto', 'cudf', 'pandas'}}, " + f"got: {engine}" ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 7e7e161bea7..d135a31438e 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
import datetime import warnings @@ -45,10 +45,14 @@ def _parse_column_statistics(cs, column_statistics_blob): if cs.HasField("intStatistics"): column_statistics["minimum"] = ( - cs.intStatistics.minimum if cs.intStatistics.HasField("minimum") else None + cs.intStatistics.minimum + if cs.intStatistics.HasField("minimum") + else None ) column_statistics["maximum"] = ( - cs.intStatistics.maximum if cs.intStatistics.HasField("maximum") else None + cs.intStatistics.maximum + if cs.intStatistics.HasField("maximum") + else None ) column_statistics["sum"] = ( cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None @@ -66,7 +70,9 @@ def _parse_column_statistics(cs, column_statistics_blob): else None ) column_statistics["sum"] = ( - cs.doubleStatistics.sum if cs.doubleStatistics.HasField("sum") else None + cs.doubleStatistics.sum + if cs.doubleStatistics.HasField("sum") + else None ) elif cs.HasField("stringStatistics"): @@ -85,7 +91,8 @@ def _parse_column_statistics(cs, column_statistics_blob): elif cs.HasField("bucketStatistics"): column_statistics["true_count"] = cs.bucketStatistics.count[0] column_statistics["false_count"] = ( - column_statistics["number_of_values"] - column_statistics["true_count"] + column_statistics["number_of_values"] + - column_statistics["true_count"] ) elif cs.HasField("decimalStatistics"): @@ -180,7 +187,9 @@ def read_orc_statistics( ) = liborc.read_raw_orc_statistics(path_or_buf) # Parse column names - column_names = [column_name.decode("utf-8") for column_name in column_names] + column_names = [ + column_name.decode("utf-8") for column_name in column_names + ] # Parse statistics cs = cs_pb2.ColumnStatistics() @@ -190,7 +199,10 @@ def read_orc_statistics( for i, raw_file_stats in enumerate(raw_file_statistics) if columns is None or column_names[i] in columns } - if any(not parsed_statistics for parsed_statistics in file_statistics.values()): + if any( + not parsed_statistics + for parsed_statistics in file_statistics.values() + ): continue else: files_statistics.append(file_statistics) @@ -313,11 +325,15 @@ def read_orc( # Must ensure a stripe for each source is specified, unless None if not len(stripes) == len(filepath_or_buffer): - raise ValueError("A list of stripes must be provided for each input source") + raise ValueError( + "A list of stripes must be provided for each input source" + ) filepaths_or_buffers = [] for source in filepath_or_buffer: - if ioutils.is_directory(path_or_data=source, storage_options=storage_options): + if ioutils.is_directory( + path_or_data=source, storage_options=storage_options + ): fs = ioutils._ensure_filesystem( passed_filesystem=None, path=source, @@ -334,7 +350,9 @@ def read_orc( bytes_per_thread=bytes_per_thread, ) if compression is not None: - raise ValueError("URL content-encoding decompression is not supported") + raise ValueError( + "URL content-encoding decompression is not supported" + ) if isinstance(tmp_source, list): filepaths_or_buffers.extend(tmp_source) else: @@ -375,14 +393,16 @@ def read_orc_stripe(orc_file, stripe, columns): warnings.warn("Using CPU via PyArrow to read ORC dataset.") if len(filepath_or_buffer) > 1: raise NotImplementedError( - "Using CPU via PyArrow only supports a single a " "single input source" + "Using CPU via PyArrow only supports a single a " + "single input source" ) orc_file = orc.ORCFile(filepath_or_buffer[0]) if stripes is not None and len(stripes) > 0: for stripe_source_file in stripes: pa_tables = [ - read_orc_stripe(orc_file, i, columns) for i in stripe_source_file + 
read_orc_stripe(orc_file, i, columns) + for i in stripe_source_file ] pa_table = pa.concat_tables(pa_tables) else: @@ -416,7 +436,8 @@ def to_orc( if isinstance(df.index, cudf.CategoricalIndex): raise NotImplementedError( - "Writing to ORC format is not yet supported with " "Categorical columns." + "Writing to ORC format is not yet supported with " + "Categorical columns." ) if cols_as_map_type is not None and not isinstance(cols_as_map_type, list): diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 73986a407af..bead9c352ef 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -392,9 +392,14 @@ def _process_dataset( # Extract hive-partition keys, and make sure they # are ordered the same as they are in `partition_categories` if partition_categories: - raw_keys = ds._get_partition_keys(file_fragment.partition_expression) + raw_keys = ds._get_partition_keys( + file_fragment.partition_expression + ) partition_keys.append( - [(name, raw_keys[name]) for name in partition_categories.keys()] + [ + (name, raw_keys[name]) + for name in partition_categories.keys() + ] ) # Apply row-group filtering @@ -414,7 +419,11 @@ def _process_dataset( row_groups.append(filtered_row_groups) else: row_groups.append( - [rg_id for rg_id in filtered_row_groups if rg_id in selection] + [ + rg_id + for rg_id in filtered_row_groups + if rg_id in selection + ] ) return ( @@ -530,7 +539,9 @@ def read_parquet( ) if compression is not None: - raise ValueError("URL content-encoding decompression is not supported") + raise ValueError( + "URL content-encoding decompression is not supported" + ) if isinstance(tmp_source, list): filepath_or_buffer.extend(tmp_source) else: @@ -558,7 +569,8 @@ def read_parquet( if columns and filters: projected_columns = columns columns = sorted( - set(v[0] for v in itertools.chain.from_iterable(filters)) | set(columns) + set(v[0] for v in itertools.chain.from_iterable(filters)) + | set(columns) ) # Convert parquet data to a cudf.DataFrame @@ -641,7 +653,9 @@ def _handle_in(column: cudf.Series, value, *, negate) -> cudf.Series: def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: if value not in {np.nan, None}: - raise TypeError("Value of 'is'/'is not' filter must be np.nan or None.") + raise TypeError( + "Value of 'is'/'is not' filter must be np.nan or None." + ) return ~column.isna() if negate else column.isna() handlers: Dict[str, Callable] = { @@ -673,7 +687,10 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: ( reduce( operator.and_, - (handlers[op](df[column], value) for (column, op, value) in expr), + ( + handlers[op](df[column], value) + for (column, op, value) in expr + ), ) for expr in filters ), @@ -682,7 +699,9 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: return df[selection].reset_index(drop=True) return df[selection] except (KeyError, TypeError): - warnings.warn(f"Row-wise filtering failed in read_parquet for {filters}") + warnings.warn( + f"Row-wise filtering failed in read_parquet for {filters}" + ) return df @@ -709,7 +728,9 @@ def _parquet_to_frame( partition_meta = None partitioning = (dataset_kwargs or {}).get("partitioning", None) if hasattr(partitioning, "schema"): - partition_meta = cudf.DataFrame.from_arrow(partitioning.schema.empty_table()) + partition_meta = cudf.DataFrame.from_arrow( + partitioning.schema.empty_table() + ) # For partitioned data, we need a distinct read for each # unique set of partition keys. 
Therefore, we start by @@ -756,7 +777,9 @@ def _parquet_to_frame( # Not building categorical columns, so # `value` is already what we want _dtype = ( - partition_meta[name].dtype if partition_meta is not None else None + partition_meta[name].dtype + if partition_meta is not None + else None ) if pd.isna(value): dfs[-1][name] = column_empty( @@ -812,7 +835,10 @@ def _read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: - if isinstance(filepaths_or_buffers, list) and len(filepaths_or_buffers) == 1: + if ( + isinstance(filepaths_or_buffers, list) + and len(filepaths_or_buffers) == 1 + ): filepaths_or_buffers = filepaths_or_buffers[0] return cudf.DataFrame.from_pandas( @@ -901,7 +927,10 @@ def to_parquet( ) partition_info = ( - [(i, j - i) for i, j in zip(partition_offsets, partition_offsets[1:])] + [ + (i, j - i) + for i, j in zip(partition_offsets, partition_offsets[1:]) + ] if partition_offsets is not None else None ) @@ -928,7 +957,9 @@ def to_parquet( import pyarrow.parquet as pq if partition_offsets is not None: - warnings.warn("partition_offsets will be ignored when engine is not cudf") + warnings.warn( + "partition_offsets will be ignored when engine is not cudf" + ) # If index is empty set it to the expected default value of True if index is None: @@ -987,7 +1018,9 @@ def _get_partitioned( preserve_index=False, storage_options=None, ): - fs = ioutils._ensure_filesystem(fs, root_path, storage_options=storage_options) + fs = ioutils._ensure_filesystem( + fs, root_path, storage_options=storage_options + ) fs.mkdirs(root_path, exist_ok=True) part_names, grouped_df, part_offsets = _get_groups_and_offsets( @@ -998,7 +1031,10 @@ def _get_partitioned( metadata_file_paths = [] for keys in part_names.itertuples(index=False): subdir = fs.sep.join( - [_hive_dirname(name, val) for name, val in zip(partition_cols, keys)] + [ + _hive_dirname(name, val) + for name, val in zip(partition_cols, keys) + ] ) prefix = fs.sep.join([root_path, subdir]) fs.mkdirs(prefix, exist_ok=True) @@ -1029,7 +1065,9 @@ def _get_groups_and_offsets( grouped_df.drop(columns=partition_cols, inplace=True) # Copy the entire keys df in one operation rather than using iloc part_names = ( - part_keys.take(part_offsets[:-1]).to_pandas(nullable=True).to_frame(index=False) + part_keys.take(part_offsets[:-1]) + .to_pandas(nullable=True) + .to_frame(index=False) ) return part_names, grouped_df, part_offsets @@ -1084,12 +1122,16 @@ def _parse_bytes(s): try: n = float(prefix) except ValueError as e: - raise ValueError("Could not interpret '%s' as a number" % prefix) from e + raise ValueError( + "Could not interpret '%s' as a number" % prefix + ) from e try: multiplier = BYTE_SIZES[suffix.lower()] except KeyError as e: - raise ValueError("Could not interpret '%s' as a byte unit" % suffix) from e + raise ValueError( + "Could not interpret '%s' as a byte unit" % suffix + ) from e result = n * multiplier return int(result) @@ -1207,7 +1249,8 @@ def __init__( if max_file_size is not None: if file_name_prefix is None: raise ValueError( - "file_name_prefix cannot be None if max_file_size is " "passed" + "file_name_prefix cannot be None if max_file_size is " + "passed" ) self.max_file_size = _parse_bytes(max_file_size) @@ -1232,7 +1275,10 @@ def write_table(self, df): for idx, keys in enumerate(part_names.itertuples(index=False)): subdir = fs.sep.join( - [f"{name}={val}" for name, val in zip(self.partition_cols, keys)] + [ + f"{name}={val}" + for name, val in zip(self.partition_cols, keys) + ] ) prefix = fs.sep.join([self.path, 
subdir]) fs.mkdirs(prefix, exist_ok=True) @@ -1250,9 +1296,9 @@ def write_table(self, df): # if the file is too large, compute metadata for # smaller chunks parts = math.ceil(current_file_size / self.max_file_size) - new_offsets = list(range(start, end, int((end - start) / parts)))[ - 1: - ] + new_offsets = list( + range(start, end, int((end - start) / parts)) + )[1:] new_offsets.append(end) num_chunks = len(new_offsets) parts = len(new_offsets) @@ -1269,24 +1315,31 @@ def write_table(self, df): # Check if the same `new_file_name` exists and # generate a `new_file_name` while new_full_path in self._file_sizes and ( - self._file_sizes[new_full_path] + (current_file_size / parts) + self._file_sizes[new_full_path] + + (current_file_size / parts) ) > (self.max_file_size): curr_file_num += 1 - new_file_name = f"{self.filename}_{curr_file_num}.parquet" + new_file_name = ( + f"{self.filename}_{curr_file_num}.parquet" + ) new_full_path = fs.sep.join([prefix, new_file_name]) self._file_sizes[new_full_path] = self._file_sizes.get( new_full_path, 0 ) + (current_file_size / parts) full_paths.append(new_full_path) - metadata_file_paths.append(fs.sep.join([subdir, new_file_name])) + metadata_file_paths.append( + fs.sep.join([subdir, new_file_name]) + ) num_chunks += 1 curr_file_num += 1 else: self.filename = self.filename or _generate_filename() full_path = fs.sep.join([prefix, self.filename]) full_paths.append(full_path) - metadata_file_paths.append(fs.sep.join([subdir, self.filename])) + metadata_file_paths.append( + fs.sep.join([subdir, self.filename]) + ) full_offsets.append(current_offset[1]) paths, metadata_file_paths, offsets = ( @@ -1372,7 +1425,9 @@ def __exit__(self, *args): self.close() -def _default_open_file_options(open_file_options, columns, row_groups, fs=None): +def _default_open_file_options( + open_file_options, columns, row_groups, fs=None +): """ Set default fields in open_file_options. diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index cc80fda3792..7a0db49bd20 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import os import textwrap @@ -62,7 +62,9 @@ def _register_option( Raised by validator if the value is invalid. """ validator(default_value) - _OPTIONS[name] = Option(default_value, default_value, description, validator) + _OPTIONS[name] = Option( + default_value, default_value, description, validator + ) def get_option(name: str) -> Any: @@ -142,7 +144,8 @@ def _make_contains_validator(valid_options: Container) -> Callable: def _validator(val): if val not in valid_options: raise ValueError( - f"{val} is not a valid option. " f"Must be one of {set(valid_options)}." + f"{val} is not a valid option. " + f"Must be one of {set(valid_options)}." ) return _validator @@ -180,7 +183,9 @@ def _integer_validator(val): int(val) return True except ValueError: - raise ValueError(f"{val} is not a valid option. " f"Must be an integer.") + raise ValueError( + f"{val} is not a valid option. " f"Must be an integer." + ) def _integer_and_none_validator(val): @@ -334,7 +339,8 @@ class option_context(ContextDecorator): def __init__(self, *args) -> None: if len(args) % 2 != 0: raise ValueError( - "Need to invoke as option_context(pat, val, " "[(pat, val), ...])." + "Need to invoke as option_context(pat, val, " + "[(pat, val), ...])." 
) self.ops = tuple(zip(args[::2], args[1::2])) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index c948993f556..47403befd6e 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import copyreg @@ -412,7 +412,9 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.BooleanArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")}, + additional_attributes={ + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") + }, ) BooleanDtype = make_final_proxy_type( @@ -430,7 +432,9 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.IntegerArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")}, + additional_attributes={ + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") + }, ) Int8Dtype = make_final_proxy_type( @@ -548,7 +552,9 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.FloatingArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__array_ufunc__": _FastSlowAttribute("__array_ufunc__")}, + additional_attributes={ + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") + }, ) Float32Dtype = make_final_proxy_type( @@ -813,7 +819,9 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): "_TextFileReader", _Unusable, pd.io.parsers.readers.TextFileReader ) -_XportReader = make_intermediate_proxy_type("_XportReader", _Unusable, pd_XportReader) +_XportReader = make_intermediate_proxy_type( + "_XportReader", _Unusable, pd_XportReader +) _SAS7BDATReader = make_intermediate_proxy_type( "_SAS7BDATReader", _Unusable, pd_SAS7BDATReader diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 9d3186e5de4..e811ba1351a 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -267,7 +267,9 @@ def __init__(self, *args, **kwargs): # disallow __init__. An intermediate proxy type can only be # instantiated from (possibly chained) operations on a final # proxy type. 
- raise TypeError(f"Cannot directly instantiate object of type {type(self)}") + raise TypeError( + f"Cannot directly instantiate object of type {type(self)}" + ) @property # type: ignore def _fsproxy_state(self): @@ -404,7 +406,8 @@ def __get__(self, obj, owner=None) -> Any: obj = owner if not ( - isinstance(obj, _FastSlowProxy) or issubclass(type(obj), _FastSlowProxyMeta) + isinstance(obj, _FastSlowProxy) + or issubclass(type(obj), _FastSlowProxyMeta) ): # we only want to look up attributes on the underlying # fast/slow objects for instances of _FastSlowProxy or @@ -574,7 +577,9 @@ def __getattr__(self, name: str) -> Any: return obj if not _is_function_or_method(obj): - return _maybe_wrap_result(obj, getattr, self._fsproxy_slow, name) + return _maybe_wrap_result( + obj, getattr, self._fsproxy_slow, name + ) @functools.wraps(obj) def _wrapped_private_slow(*args, **kwargs): @@ -932,7 +937,8 @@ def _transform_arg( # transformed pieces # This handles scipy._lib._bunch._make_tuple_bunch args, kwargs = ( - _transform_arg(a, attribute_name, seen) for a in arg.__getnewargs_ex__() + _transform_arg(a, attribute_name, seen) + for a in arg.__getnewargs_ex__() ) obj = type(arg).__new__(type(arg), *args, **kwargs) if hasattr(obj, "__setstate__"): @@ -954,7 +960,9 @@ def _transform_arg( return type(arg).__new__(type(arg), *args) else: # Hope we can just call the constructor with transformed entries. - return type(arg)(_transform_arg(a, attribute_name, seen) for a in args) + return type(arg)( + _transform_arg(a, attribute_name, seen) for a in args + ) elif isinstance(arg, dict): return { _transform_arg(k, attribute_name, seen): _transform_arg( @@ -963,7 +971,9 @@ def _transform_arg( for k, a in arg.items() } elif isinstance(arg, np.ndarray) and arg.dtype == "O": - transformed = [_transform_arg(a, attribute_name, seen) for a in arg.flat] + transformed = [ + _transform_arg(a, attribute_name, seen) for a in arg.flat + ] # Keep the same memory layout as arg (the default is C_CONTIGUOUS) if arg.flags["F_CONTIGUOUS"] and not arg.flags["C_CONTIGUOUS"]: order = "F" @@ -1037,7 +1047,9 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: elif isinstance(result, Iterator): return (_maybe_wrap_result(r, lambda x: x, r) for r in result) elif _is_function_or_method(result): - return _MethodProxy._fsproxy_wrap(result, method_chain=(func, args, kwargs)) + return _MethodProxy._fsproxy_wrap( + result, method_chain=(func, args, kwargs) + ) else: return result diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index abcf0f3e98c..bae31499280 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -79,16 +79,22 @@ def deduce_cudf_pandas_mode(slow_lib: str, fast_lib: str) -> DeducedMode: if "CUDF_PANDAS_FALLBACK_MODE" not in os.environ: try: importlib.import_module(fast_lib) - return DeducedMode(use_fast_lib=True, slow_lib=slow_lib, fast_lib=fast_lib) + return DeducedMode( + use_fast_lib=True, slow_lib=slow_lib, fast_lib=fast_lib + ) except Exception as e: warnings.warn( f"Exception encountered importing {fast_lib}: {e}." f"Falling back to only using {slow_lib}." 
) - return DeducedMode(use_fast_lib=False, slow_lib=slow_lib, fast_lib=slow_lib) + return DeducedMode( + use_fast_lib=False, slow_lib=slow_lib, fast_lib=slow_lib + ) -class ModuleAcceleratorBase(importlib.abc.MetaPathFinder, importlib.abc.Loader): +class ModuleAcceleratorBase( + importlib.abc.MetaPathFinder, importlib.abc.Loader +): _instance: ModuleAcceleratorBase | None = None mod_name: str fast_lib: str @@ -121,7 +127,9 @@ def __new__( Name of package that provides "slow" fallback implementation """ if ModuleAcceleratorBase._instance is not None: - raise RuntimeError("Only one instance of ModuleAcceleratorBase allowed") + raise RuntimeError( + "Only one instance of ModuleAcceleratorBase allowed" + ) self = object.__new__(cls) self.mod_name = mod_name self.fast_lib = fast_lib @@ -143,7 +151,8 @@ def __new__( def __repr__(self) -> str: return ( - f"{self.__class__.__name__}" f"(fast={self.fast_lib}, slow={self.slow_lib})" + f"{self.__class__.__name__}" + f"(fast={self.fast_lib}, slow={self.slow_lib})" ) def find_spec( @@ -163,7 +172,9 @@ def find_spec( A ModuleSpec with ourself as loader if we're interposing, otherwise None to pass off to the next loader. """ - if fullname == self.mod_name or fullname.startswith(f"{self.mod_name}."): + if fullname == self.mod_name or fullname.startswith( + f"{self.mod_name}." + ): return importlib.machinery.ModuleSpec( name=fullname, loader=self, @@ -305,7 +316,9 @@ def _wrap_attribute( # now, attempt to import the wrapped module, which will # recursively wrap all of its attributes: return importlib.import_module( - rename_root_module(slow_attr.__name__, self.slow_lib, self.mod_name) + rename_root_module( + slow_attr.__name__, self.slow_lib, self.mod_name + ) ) if slow_attr in self._wrapped_objs: if type(fast_attr) is _Unusable: @@ -544,8 +557,11 @@ def getattr_real_or_wrapped( # We cannot possibly be at the top level. assert frame.f_back calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) - use_real = not calling_module.is_relative_to(CUDF_PANDAS_PATH) and any( - calling_module.is_relative_to(path) for path in loader._denylist + use_real = not calling_module.is_relative_to( + CUDF_PANDAS_PATH + ) and any( + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: @@ -580,7 +596,9 @@ def install( ) mode = deduce_cudf_pandas_mode(slow_lib, fast_lib) if mode.use_fast_lib: - importlib.import_module(f".._wrappers.{mode.slow_lib}", __name__) + importlib.import_module( + f".._wrappers.{mode.slow_lib}", __name__ + ) try: (self,) = ( p diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index 2f0dcd44ac8..49a417eec09 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -145,11 +145,16 @@ def get_namespaced_function_name( def _tracefunc(self, frame, event, arg): if event == "line" and frame.f_code.co_filename == self._currfile: key = "".join(inspect.stack()[1].code_context) - if not any(ignore_word in key for ignore_word in Profiler._IGNORE_LIST): + if not any( + ignore_word in key for ignore_word in Profiler._IGNORE_LIST + ): self._currkey = (frame.f_lineno, self._currfile, key) self._results.setdefault(self._currkey, {}) self._timer[self._currkey] = time.perf_counter() - elif event == "call" and frame.f_code.co_name == "_fast_slow_function_call": + elif ( + event == "call" + and frame.f_code.co_name == "_fast_slow_function_call" + ): if self._currkey is not None: self._timer[self._currkey] = time.perf_counter() @@ -165,18 +170,23 @@ def _tracefunc(self, frame, event, arg): ): func_name = self.get_namespaced_function_name(func_obj) self._call_stack.append((func_name, time.perf_counter())) - elif event == "return" and frame.f_code.co_name == "_fast_slow_function_call": + elif ( + event == "return" + and frame.f_code.co_name == "_fast_slow_function_call" + ): if self._currkey is not None and arg is not None: if arg[1]: # fast run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey]["gpu_time"] = run_time + self._results[ - self._currkey - ].get("gpu_time", 0) + self._results[self._currkey]["gpu_time"] = ( + run_time + + self._results[self._currkey].get("gpu_time", 0) + ) else: run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey]["cpu_time"] = run_time + self._results[ - self._currkey - ].get("cpu_time", 0) + self._results[self._currkey]["cpu_time"] = ( + run_time + + self._results[self._currkey].get("cpu_time", 0) + ) frame_locals = inspect.getargvalues(frame).locals if ( diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index c4f1161399a..f1744c9e92b 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -35,7 +35,9 @@ def count_failures(log_file_name, pattern): and line["when"] == "call" and line["outcome"] == "failed" ): - line_module_name = line["location"][0].removeprefix(PANDAS_TEST_PREFIX) + line_module_name = line["location"][0].removeprefix( + PANDAS_TEST_PREFIX + ) if fnmatch(line_module_name, pattern): if "longrepr" in line and line["longrepr"]: if isinstance(line["longrepr"], (tuple, list)): diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 0144abb0dca..bfc56319d82 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -47,7 +47,9 @@ def get_per_module_results(log_file_name): # it's an xpassed test outcome = "failed" module_name = ( - line["nodeid"].split("::")[0].removeprefix(PANDAS_TEST_PREFIX) + line["nodeid"] + .split("::")[0] + .removeprefix(PANDAS_TEST_PREFIX) ) per_module_results.setdefault(module_name, {}) per_module_results[module_name].setdefault("total", 0) @@ -58,7 +60,9 @@ def get_per_module_results(log_file_name): def sort_results(results): - sorted_keys = sorted(results, key=lambda key: results[key].get("failed", 0)) + sorted_keys = sorted( + results, key=lambda key: results[key].get("failed", 0) + ) return {key: results[key] for key in sorted_keys} @@ -94,7 +98,9 @@ def print_results_as_table(results): if __name__ == "__main__": # parse arguments parser = argparse.ArgumentParser() - parser.add_argument("log_file_name", nargs=1, help="The input log file name") + parser.add_argument( + "log_file_name", nargs=1, help="The input log file name" + ) parser.add_argument( "--output", choices=["json", "table"], diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 78fc826212e..e067d15af4c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -125,7 +125,9 @@ def assert_eq(left, right, **kwargs): # `object`. Check equality before that happens: if kwargs.get("check_dtype", True): if hasattr(left, "dtype") and hasattr(right, "dtype"): - if isinstance(left.dtype, cudf.core.dtypes._BaseDtype) and not isinstance( + if isinstance( + left.dtype, cudf.core.dtypes._BaseDtype + ) and not isinstance( left.dtype, cudf.CategoricalDtype ): # leave categorical comparison to Pandas assert_eq(left.dtype, right.dtype) @@ -148,7 +150,9 @@ def assert_eq(left, right, **kwargs): # This warning comes from a call from pandas to numpy. It is ignored # here because it cannot be fixed within cudf. 
with warnings.catch_warnings(): - warnings.simplefilter("ignore", (DeprecationWarning, FutureWarning)) + warnings.simplefilter( + "ignore", (DeprecationWarning, FutureWarning) + ) if isinstance(left, pd.DataFrame): tm.assert_frame_equal(left, right, **kwargs) elif isinstance(left, pd.Series): @@ -308,7 +312,9 @@ def gen_rand(dtype, size, **kwargs): elif dtype.kind == "b": low = kwargs.get("low", 0) high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype(np.bool_) + return np.random.randint(low=low, high=high, size=size).astype( + np.bool_ + ) elif dtype.kind == "M": low = kwargs.get("low", 0) time_unit, _ = np.datetime_data(dtype) @@ -325,7 +331,9 @@ def gen_rand(dtype, size, **kwargs): nchars = np.random.randint(low=low, high=high, size=1)[0] char_options = np.array(list(string.ascii_letters + string.digits)) all_chars = "".join(np.random.choice(char_options, nchars * size)) - return np.array([all_chars[nchars * i : nchars * (i + 1)] for i in range(size)]) + return np.array( + [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] + ) raise NotImplementedError(f"dtype.kind={dtype.kind}") diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index c1bf9ac0746..13c194d6be0 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -104,7 +104,10 @@ def _generate_column(column_params, num_rows): # Construct set of values to sample from where # set size = cardinality - if isinstance(column_params.dtype, str) and column_params.dtype == "category": + if ( + isinstance(column_params.dtype, str) + and column_params.dtype == "category" + ): vals = pa.array( column_params.generator, size=column_params.cardinality, @@ -112,7 +115,9 @@ def _generate_column(column_params, num_rows): ) return pa.DictionaryArray.from_arrays( dictionary=vals, - indices=np.random.randint(low=0, high=len(vals), size=num_rows), + indices=np.random.randint( + low=0, high=len(vals), size=num_rows + ), mask=np.random.choice( [True, False], size=num_rows, @@ -173,7 +178,9 @@ def _generate_column(column_params, num_rows): else None, size=num_rows, safe=False, - type=None if isinstance(arrow_type, pa.lib.Decimal128Type) else arrow_type, + type=None + if isinstance(arrow_type, pa.lib.Decimal128Type) + else arrow_type, ) if isinstance(arrow_type, pa.lib.Decimal128Type): vals = vals.cast(arrow_type, safe=False) @@ -235,7 +242,10 @@ def get_dataframe(parameters, use_threads): # Get schema for each column table_fields = [] for i, column_params in enumerate(parameters.column_parameters): - if isinstance(column_params.dtype, str) and column_params.dtype == "category": + if ( + isinstance(column_params.dtype, str) + and column_params.dtype == "category" + ): arrow_type = pa.dictionary( index_type=pa.int64(), value_type=np_to_pa_dtype( @@ -270,7 +280,9 @@ def get_dataframe(parameters, use_threads): # Generate data if not use_threads: for i, column_params in enumerate(parameters.column_parameters): - column_data[i] = _generate_column(column_params, parameters.num_rows) + column_data[i] = _generate_column( + column_params, parameters.num_rows + ) else: pool = Pool(pa.cpu_count()) column_data = pool.starmap( @@ -386,7 +398,9 @@ def rand_dataframe( ) ) elif dtype == "decimal64": - max_precision = meta.get("max_precision", cudf.Decimal64Dtype.MAX_PRECISION) + max_precision = meta.get( + "max_precision", cudf.Decimal64Dtype.MAX_PRECISION + ) precision = np.random.randint(1, max_precision) 
scale = np.random.randint(0, precision) dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) @@ -400,7 +414,9 @@ def rand_dataframe( ) ) elif dtype == "decimal32": - max_precision = meta.get("max_precision", cudf.Decimal32Dtype.MAX_PRECISION) + max_precision = meta.get( + "max_precision", cudf.Decimal32Dtype.MAX_PRECISION + ) precision = np.random.randint(1, max_precision) scale = np.random.randint(0, precision) dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) @@ -668,9 +684,13 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): for _ in range(cardinality) ] elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype(dtype) + values = datetime_generator(dtype=dtype, size=cardinality)().astype( + dtype + ) elif dtype.kind == "m": - values = timedelta_generator(dtype=dtype, size=cardinality)().astype(dtype) + values = timedelta_generator(dtype=dtype, size=cardinality)().astype( + dtype + ) elif dtype.kind == "b": values = boolean_generator(cardinality)().astype(dtype) else: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 5eb53335e2e..fc253c5c197 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -35,9 +35,13 @@ def dtype_can_compare_equal_to_other(dtype): def _check_isinstance(left, right, obj): if not isinstance(left, obj): - raise AssertionError(f"{obj} Expected type {obj}, found {type(left)} instead") + raise AssertionError( + f"{obj} Expected type {obj}, found {type(left)} instead" + ) elif not isinstance(right, obj): - raise AssertionError(f"{obj} Expected type {obj}, found {type(right)} instead") + raise AssertionError( + f"{obj} Expected type {obj}, found {type(right)} instead" + ) def raise_assert_detail(obj, message, left, right, diff=None): @@ -53,7 +57,9 @@ def raise_assert_detail(obj, message, left, right, diff=None): raise AssertionError(msg) -def _check_types(left, right, check_categorical=True, exact="equiv", obj="Index"): +def _check_types( + left, right, check_categorical=True, exact="equiv", obj="Index" +): if not exact or exact == "equiv": if ( isinstance(left, cudf.RangeIndex) @@ -77,9 +83,15 @@ def _check_types(left, right, check_categorical=True, exact="equiv", obj="Index" obj, "Class types are different", f"{type(left)}", f"{type(right)}" ) - if exact and not isinstance(left, cudf.MultiIndex) and _is_categorical_dtype(left): + if ( + exact + and not isinstance(left, cudf.MultiIndex) + and _is_categorical_dtype(left) + ): if left.dtype != right.dtype: - raise_assert_detail(obj, "Categorical difference", f"{left}", f"{right}") + raise_assert_detail( + obj, "Categorical difference", f"{left}", f"{right}" + ) def assert_column_equal( @@ -193,7 +205,11 @@ def assert_column_equal( f"{obj} category", "Orders are different", msg1, msg2 ) - if not check_dtype and _is_categorical_dtype(left) and _is_categorical_dtype(right): + if ( + not check_dtype + and _is_categorical_dtype(left) + and _is_categorical_dtype(right) + ): left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) columns_equal = False @@ -211,18 +227,30 @@ def assert_column_equal( ): try: # nulls must be in the same places for all dtypes - columns_equal = cp.all(left.isnull().values == right.isnull().values) + columns_equal = cp.all( + left.isnull().values == right.isnull().values + ) - if columns_equal and not check_exact and is_numeric_dtype(left.dtype): + if ( + columns_equal + and not check_exact + and 
is_numeric_dtype(left.dtype) + ): # non-null values must be the same columns_equal = cp.allclose( - left.apply_boolean_mask(left.isnull().unary_operator("not")).values, + left.apply_boolean_mask( + left.isnull().unary_operator("not") + ).values, right.apply_boolean_mask( right.isnull().unary_operator("not") ).values, ) - if columns_equal and (left.dtype.kind == right.dtype.kind == "f"): - columns_equal = cp.all(is_nan(left).values == is_nan(right).values) + if columns_equal and ( + left.dtype.kind == right.dtype.kind == "f" + ): + columns_equal = cp.all( + is_nan(left).values == is_nan(right).values + ) else: columns_equal = left.equals(right) except TypeError as e: @@ -345,7 +373,9 @@ def assert_index_equal( # instance validation _check_isinstance(left, right, cudf.BaseIndex) - _check_types(left, right, exact=exact, check_categorical=check_categorical, obj=obj) + _check_types( + left, right, exact=exact, check_categorical=check_categorical, obj=obj + ) if len(left) != len(right): raise_assert_detail( @@ -391,7 +421,9 @@ def assert_index_equal( # metadata comparison if check_names and (left.name != right.name): - raise_assert_detail(obj, "name mismatch", f"{left.name}", f"{right.name}") + raise_assert_detail( + obj, "name mismatch", f"{left.name}", f"{right.name}" + ) def assert_series_equal( @@ -516,7 +548,9 @@ def assert_series_equal( # metadata comparison if check_names and (left.name != right.name): - raise_assert_detail(obj, "name mismatch", f"{left.name}", f"{right.name}") + raise_assert_detail( + obj, "name mismatch", f"{left.name}", f"{right.name}" + ) def assert_frame_equal( diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py index bd1dbba5428..f2c2d9a263b 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py +++ b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. 
import pandas as pd @@ -8,8 +8,12 @@ def test_slice_datetimetz_index(): data = ["2001-01-01", "2001-01-02", None, None, "2001-01-03"] - pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize("US/Eastern") - idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize("US/Eastern") + pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize( + "US/Eastern" + ) + idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize( + "US/Eastern" + ) expected = pidx[1:4] got = idx[1:4] assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py index 43edebb5b4f..b28ef131025 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py +++ b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py @@ -21,7 +21,9 @@ def test_tz_convert(): idx = cudf.from_pandas(pidx) pidx = pidx.tz_localize("UTC") idx = idx.tz_localize("UTC") - assert_eq(pidx.tz_convert("America/New_York"), idx.tz_convert("America/New_York")) + assert_eq( + pidx.tz_convert("America/New_York"), idx.tz_convert("America/New_York") + ) def test_delocalize_naive(): diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 2d47f051342..d59041e32d5 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -79,7 +79,9 @@ def test_interval_range_empty(closed): @pytest.mark.parametrize("end", [6, 8, 10, 43, 70]) def test_interval_range_freq_basic(start, end, freq, closed): pindex = pd.interval_range(start=start, end=end, freq=freq, closed=closed) - gindex = cudf.interval_range(start=start, end=end, freq=freq, closed=closed) + gindex = cudf.interval_range( + start=start, end=end, freq=freq, closed=closed + ) assert_eq(pindex, gindex) @@ -95,7 +97,9 @@ def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t): pindex = pd.interval_range( start=start_val, end=end_val, freq=freq_val, closed="left" ) - gindex = cudf.interval_range(start=start, end=end, freq=freq, closed="left") + gindex = cudf.interval_range( + start=start, end=end, freq=freq, closed="left" + ) if gindex.dtype.subtype.kind == "f": gindex = gindex.astype( cudf.IntervalDtype(subtype="float64", closed=gindex.dtype.closed) @@ -115,8 +119,12 @@ def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t): @pytest.mark.parametrize("start", [0, 0.0, 1.0, 1, 2, 2.0, 3.0, 3]) @pytest.mark.parametrize("end", [4, 4.0, 5.0, 5, 6, 6.0, 7.0, 7]) def test_interval_range_periods_basic(start, end, periods, closed): - pindex = pd.interval_range(start=start, end=end, periods=periods, closed=closed) - gindex = cudf.interval_range(start=start, end=end, periods=periods, closed=closed) + pindex = pd.interval_range( + start=start, end=end, periods=periods, closed=closed + ) + gindex = cudf.interval_range( + start=start, end=end, periods=periods, closed=closed + ) assert_eq(pindex, gindex) @@ -128,11 +136,15 @@ def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): start, end, periods = start_t(0), end_t(4), periods_t(1) start_val = start.value if isinstance(start, cudf.Scalar) else start end_val = end.value if isinstance(end, cudf.Scalar) else end - periods_val = periods.value if isinstance(periods, cudf.Scalar) else periods + periods_val = ( + periods.value if isinstance(periods, cudf.Scalar) else periods + ) pindex = pd.interval_range( start=start_val, end=end_val, periods=periods_val, closed="left" ) - 
gindex = cudf.interval_range(start=start, end=end, periods=periods, closed="left") + gindex = cudf.interval_range( + start=start, end=end, periods=periods, closed="left" + ) assert_eq(pindex, gindex) @@ -157,8 +169,12 @@ def test_interval_range_periods_warnings(): @pytest.mark.parametrize("freq", [1, 2, 3, 4]) @pytest.mark.parametrize("end", [4, 8, 9, 10]) def test_interval_range_periods_freq_end(end, freq, periods, closed): - pindex = pd.interval_range(end=end, freq=freq, periods=periods, closed=closed) - gindex = cudf.interval_range(end=end, freq=freq, periods=periods, closed=closed) + pindex = pd.interval_range( + end=end, freq=freq, periods=periods, closed=closed + ) + gindex = cudf.interval_range( + end=end, freq=freq, periods=periods, closed=closed + ) assert_eq(pindex, gindex) @@ -170,11 +186,15 @@ def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t): periods, freq, end = periods_t(2), freq_t(3), end_t(10) freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq end_val = end.value if isinstance(end, cudf.Scalar) else end - periods_val = periods.value if isinstance(periods, cudf.Scalar) else periods + periods_val = ( + periods.value if isinstance(periods, cudf.Scalar) else periods + ) pindex = pd.interval_range( end=end_val, freq=freq_val, periods=periods_val, closed="left" ) - gindex = cudf.interval_range(end=end, freq=freq, periods=periods, closed="left") + gindex = cudf.interval_range( + end=end, freq=freq, periods=periods, closed="left" + ) assert_eq(pindex, gindex) @@ -184,8 +204,12 @@ def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t): @pytest.mark.parametrize("freq", [1, 2, 3, 4]) @pytest.mark.parametrize("start", [1, 4, 9, 12]) def test_interval_range_periods_freq_start(start, freq, periods, closed): - pindex = pd.interval_range(start=start, freq=freq, periods=periods, closed=closed) - gindex = cudf.interval_range(start=start, freq=freq, periods=periods, closed=closed) + pindex = pd.interval_range( + start=start, freq=freq, periods=periods, closed=closed + ) + gindex = cudf.interval_range( + start=start, freq=freq, periods=periods, closed=closed + ) assert_eq(pindex, gindex) @@ -197,11 +221,15 @@ def test_interval_range_periods_freq_start_dtype(periods_t, freq_t, start_t): periods, freq, start = periods_t(2), freq_t(3), start_t(9) freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq start_val = start.value if isinstance(start, cudf.Scalar) else start - periods_val = periods.value if isinstance(periods, cudf.Scalar) else periods + periods_val = ( + periods.value if isinstance(periods, cudf.Scalar) else periods + ) pindex = pd.interval_range( start=start_val, freq=freq_val, periods=periods_val, closed="left" ) - gindex = cudf.interval_range(start=start, freq=freq, periods=periods, closed="left") + gindex = cudf.interval_range( + start=start, freq=freq, periods=periods, closed="left" + ) # pandas upcasts to 64 bit https://github.com/pandas-dev/pandas/issues/57268 # using Series to use check_dtype @@ -309,7 +337,9 @@ def test_interval_index_from_breaks(closed): ], ) def test_interval_range_floating(start, stop, freq, periods): - expected = pd.interval_range(start=start, end=stop, freq=freq, periods=periods) + expected = pd.interval_range( + start=start, end=stop, freq=freq, periods=periods + ) got = interval_range(start=start, end=stop, freq=freq, periods=periods) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py index 
3839908d4d5..acba13bb5b0 100644 --- a/python/cudf/cudf/tests/input_output/test_text.py +++ b/python/cudf/cudf/tests/input_output/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. from io import StringIO @@ -23,7 +23,10 @@ def test_read_text(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] ) actual = cudf.read_text(chess_file, delimiter=delimiter) @@ -42,7 +45,10 @@ def test_read_text_byte_range(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] ) byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) @@ -78,7 +84,9 @@ def test_read_text_byte_range_large(tmpdir): expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) - actual = cudf.read_text(temp_file, delimiter=delimiter, byte_range=[1000, 1000]) + actual = cudf.read_text( + temp_file, delimiter=delimiter, byte_range=[1000, 1000] + ) assert_eq(expected, actual) @@ -98,7 +106,9 @@ def test_read_text_in_memory_strip_delimiter(datadir): # not we need to add it back to the 'content' expected = cudf.Series(["x", "y", "z"]) - actual = cudf.read_text(StringIO("x::y::z"), delimiter="::", strip_delimiters=True) + actual = cudf.read_text( + StringIO("x::y::z"), delimiter="::", strip_delimiters=True + ) assert_eq(expected, actual) @@ -114,7 +124,10 @@ def test_read_text_bgzip(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] ) actual = cudf.read_text( @@ -135,7 +148,10 @@ def test_read_text_bgzip_offsets(datadir): # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series( - [c + delimiter if i < (len(content) - 1) else c for i, c in enumerate(content)] + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] ) actual = cudf.read_text( diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 976e9e5216e..98be7045923 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -33,7 +33,9 @@ def unit(request): return request.param -@pytest.fixture(params=["America/New_York", "Asia/Tokyo", "CET", "Etc/GMT+1", "UTC"]) +@pytest.fixture( + params=["America/New_York", "Asia/Tokyo", "CET", "Etc/GMT+1", "UTC"] +) def tz(request): return request.param @@ -67,7 +69,9 @@ def test_localize_ambiguous(request, unit, zone_name): ], dtype=f"datetime64[{unit}]", ) - expect = s.to_pandas().dt.tz_localize(zone_name, ambiguous="NaT", nonexistent="NaT") + expect = s.to_pandas().dt.tz_localize( + zone_name, ambiguous="NaT", nonexistent="NaT" + ) got = s.dt.tz_localize(zone_name) assert_eq(expect, got) @@ -91,15 +95,17 @@ def test_localize_nonexistent(request, unit, zone_name): ], 
dtype=f"datetime64[{unit}]", ) - expect = s.to_pandas().dt.tz_localize(zone_name, ambiguous="NaT", nonexistent="NaT") + expect = s.to_pandas().dt.tz_localize( + zone_name, ambiguous="NaT", nonexistent="NaT" + ) got = s.dt.tz_localize(zone_name) assert_eq(expect, got) def test_delocalize(unit, tz): - psr = pd.Series(pd.date_range("2001-01-01", "2001-01-02", freq="1s")).astype( - f"datetime64[{unit}]" - ) + psr = pd.Series( + pd.date_range("2001-01-01", "2001-01-02", freq="1s") + ).astype(f"datetime64[{unit}]") sr = cudf.from_pandas(psr) expect = psr.dt.tz_localize(tz).dt.tz_localize(None) @@ -117,8 +123,12 @@ def test_delocalize_naive(): assert_eq(expect, got) -@pytest.mark.parametrize("from_tz", ["Europe/London", "America/Chicago", "UTC"]) -@pytest.mark.parametrize("to_tz", ["Europe/London", "America/Chicago", "UTC", None]) +@pytest.mark.parametrize( + "from_tz", ["Europe/London", "America/Chicago", "UTC"] +) +@pytest.mark.parametrize( + "to_tz", ["Europe/London", "America/Chicago", "UTC", None] +) def test_convert(from_tz, to_tz): ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="h")) gs = cudf.from_pandas(ps) @@ -159,8 +169,12 @@ def test_convert_from_naive(): ], ) def test_convert_edge_cases(data, original_timezone, target_timezone): - ps = pd.Series(data, dtype="datetime64[s]").dt.tz_localize(original_timezone) - gs = cudf.Series(data, dtype="datetime64[s]").dt.tz_localize(original_timezone) + ps = pd.Series(data, dtype="datetime64[s]").dt.tz_localize( + original_timezone + ) + gs = cudf.Series(data, dtype="datetime64[s]").dt.tz_localize( + original_timezone + ) expect = ps.dt.tz_convert(target_timezone) got = gs.dt.tz_convert(target_timezone) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 7bf20bf97b3..58939f0ddd9 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -58,7 +58,9 @@ def test_array_func_cudf_series(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]) +@pytest.mark.parametrize( + "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] +) @pytest.mark.parametrize( "func", [ @@ -80,7 +82,9 @@ def test_array_func_cudf_dataframe(pd_df, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))]) +@pytest.mark.parametrize( + "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] +) @pytest.mark.parametrize( "func", [ diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index c9d423a59c5..b036c1f13f3 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -22,7 +22,9 @@ ) _UFUNCS = [ - obj for obj in (getattr(np, name) for name in dir(np)) if isinstance(obj, np.ufunc) + obj + for obj in (getattr(np, name) for name in dir(np)) + if isinstance(obj, np.ufunc) ] @@ -265,7 +267,9 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @pytest.mark.parametrize("reflect", [True, False]) -def test_binary_ufunc_series_array(request, ufunc, has_nulls, indexed, reflect): +def test_binary_ufunc_series_array( + request, ufunc, has_nulls, indexed, reflect +): fname = ufunc.__name__ request.applymarker( 
pytest.mark.xfail( @@ -282,9 +286,13 @@ def test_binary_ufunc_series_array(request, ufunc, has_nulls, indexed, reflect): request.applymarker( pytest.mark.xfail( condition=( - fname in {"greater", "greater_equal", "logical_and"} and has_nulls + fname in {"greater", "greater_equal", "logical_and"} + and has_nulls + ), + reason=( + "cudf and pandas incompatible casting nans " + "to nulls in binops" ), - reason=("cudf and pandas incompatible casting nans " "to nulls in binops"), ) ) N = 100 @@ -417,7 +425,9 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): if indexed and ufunc.nin == 2 else args ) - mask = reduce(operator.or_, (a["foo"].isna() for a in aligned)).to_pandas() + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() got = ufunc(*args) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 609012a4c6d..0e38b10ed52 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -64,7 +64,9 @@ def test_can_detect_dtype_from_avro_type( actual = cudf_from_avro_util(schema, []) - expected = cudf.DataFrame({"prop": cudf.Series(None, None, expected_dtype)}) + expected = cudf.DataFrame( + {"prop": cudf.Series(None, None, expected_dtype)} + ) assert_eq(expected, actual) @@ -102,7 +104,9 @@ def test_can_detect_dtype_from_avro_type_nested( ns="" if namespace is None else namespace + "." ) - expected = cudf.DataFrame({col_name: cudf.Series(None, None, expected_dtype)}) + expected = cudf.DataFrame( + {col_name: cudf.Series(None, None, expected_dtype)} + ) assert_eq(expected, actual) @@ -133,7 +137,9 @@ def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame({"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)}) + expected = cudf.DataFrame( + {"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)} + ) assert_eq(expected, actual) @@ -150,7 +156,9 @@ def test_can_parse_single_null(avro_type, cudf_type): actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame({"prop": cudf.Series(data=[None], dtype=cudf_type)}) + expected = cudf.DataFrame( + {"prop": cudf.Series(data=[None], dtype=cudf_type)} + ) assert_eq(expected, actual) @@ -172,7 +180,9 @@ def test_can_parse_no_data(avro_type, cudf_type): assert_eq(expected, actual) -@pytest.mark.xfail(reason="cudf avro reader is unable to parse zero-field metadata.") +@pytest.mark.xfail( + reason="cudf avro reader is unable to parse zero-field metadata." 
+) @pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) def test_can_parse_no_fields(avro_type, cudf_type): schema_root = { @@ -275,7 +285,9 @@ def test_can_detect_dtypes_from_avro_logical_type( actual = cudf_from_avro_util(schema, []) - expected = cudf.DataFrame({"prop": cudf.Series(None, None, expected_dtype)}) + expected = cudf.DataFrame( + {"prop": cudf.Series(None, None, expected_dtype)} + ) assert_eq(expected, actual) @@ -337,7 +349,9 @@ def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): actual = cudf_from_avro_util(schema, records) - expected = cudf.DataFrame({"o_date": cudf.Series(dates, dtype="datetime64[s]")}) + expected = cudf.DataFrame( + {"o_date": cudf.Series(dates, dtype="datetime64[s]")} + ) assert_eq(expected, actual) @@ -597,7 +611,9 @@ def test_avro_reader_multiblock( source_df = cudf.DataFrame({"0": pd.Series(values)}) if limit_rows: - expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index(drop=True) + expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index( + drop=True + ) else: expected_df = source_df[skip_rows:].reset_index(drop=True) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index bf54964219e..438f3e35ec8 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -240,7 +240,9 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar): @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @pytest.mark.parametrize("binop", _bitwise_binops) -@pytest.mark.parametrize("lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))) +@pytest.mark.parametrize( + "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types)) +) def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): arr1 = (np.random.random(100) * 100).astype(lhs_dtype) sr1 = Series(arr1) @@ -340,9 +342,13 @@ def cmp_scalar(request): return request.param -def test_str_series_compare_str(str_series_cmp_data, str_series_compare_str_cmpop): +def test_str_series_compare_str( + str_series_cmp_data, str_series_compare_str_cmpop +): expect = str_series_compare_str_cmpop(str_series_cmp_data, "a") - got = str_series_compare_str_cmpop(Series.from_pandas(str_series_cmp_data), "a") + got = str_series_compare_str_cmpop( + Series.from_pandas(str_series_cmp_data), "a" + ) utils.assert_eq(expect, got.to_pandas(nullable=True)) @@ -351,7 +357,9 @@ def test_str_series_compare_str_reflected( str_series_cmp_data, str_series_compare_str_cmpop ): expect = str_series_compare_str_cmpop("a", str_series_cmp_data) - got = str_series_compare_str_cmpop("a", Series.from_pandas(str_series_cmp_data)) + got = str_series_compare_str_cmpop( + "a", Series.from_pandas(str_series_cmp_data) + ) utils.assert_eq(expect, got.to_pandas(nullable=True)) @@ -383,7 +391,9 @@ def test_str_series_compare_num_reflected( @pytest.mark.parametrize("cmpop", _cmpops) @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES + ["datetime64[ms]"]) @pytest.mark.parametrize("use_cudf_scalar", [True, False]) -def test_series_compare_scalar(nelem, cmpop, obj_class, dtype, use_cudf_scalar): +def test_series_compare_scalar( + nelem, cmpop, obj_class, dtype, use_cudf_scalar +): arr1 = np.random.randint(0, 100, 100).astype(dtype) sr1 = Series(arr1) rhs = random.choice(arr1).item() @@ -441,13 +451,13 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls): utils.expand_bits_to_bytes(lhs_mask & rhs_mask), dtype=np.bool_ )[:nelem] if lhs_nulls == "some" and rhs_nulls == "none": - res_mask = 
np.asarray(utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool_)[ - :nelem - ] + res_mask = np.asarray( + utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool_ + )[:nelem] if lhs_nulls == "none" and rhs_nulls == "some": - res_mask = np.asarray(utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool_)[ - :nelem - ] + res_mask = np.asarray( + utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool_ + )[:nelem] # Fill NA values na_value = -10000 got = res.fillna(na_value).to_numpy() @@ -633,8 +643,12 @@ def test_different_shapes_and_same_columns(binop): if binop is operator.pow: return - pd_frame = binop(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [1, 2, 3]})) - cd_frame = binop(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({"x": [1, 2, 3]})) + pd_frame = binop( + pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [1, 2, 3]}) + ) + cd_frame = binop( + cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({"x": [1, 2, 3]}) + ) # cast x as float64 so it matches pandas dtype cd_frame["x"] = cd_frame["x"].astype(np.float64) utils.assert_eq(cd_frame, pd_frame) @@ -650,7 +664,9 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) # Test with an Index - pdf2 = pd.DataFrame({"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4]) + pdf2 = pd.DataFrame( + {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] + ) # Test with an Index in a different order pdf3 = pd.DataFrame( {"x": [4, 5, 6, 7], "y": [1, 2, 3, 7], "z": [0, 5, 3, 7]}, @@ -719,12 +735,18 @@ def test_operator_func_between_series(dtype, func, has_nulls, fill_value): gdf_series_a = utils.gen_rand_series( dtype, count, has_nulls=has_nulls, stride=10000 ) - gdf_series_b = utils.gen_rand_series(dtype, count, has_nulls=has_nulls, stride=100) + gdf_series_b = utils.gen_rand_series( + dtype, count, has_nulls=has_nulls, stride=100 + ) pdf_series_a = gdf_series_a.to_pandas() pdf_series_b = gdf_series_b.to_pandas() - gdf_result = getattr(gdf_series_a, func)(gdf_series_b, fill_value=fill_value) - pdf_result = getattr(pdf_series_a, func)(pdf_series_b, fill_value=fill_value) + gdf_result = getattr(gdf_series_a, func)( + gdf_series_b, fill_value=fill_value + ) + pdf_result = getattr(pdf_series_a, func)( + pdf_series_b, fill_value=fill_value + ) utils.assert_eq(pdf_result, gdf_result) @@ -739,14 +761,18 @@ def test_operator_func_series_and_scalar( ): count = 1000 scalar = 59 - gdf_series = utils.gen_rand_series(dtype, count, has_nulls=has_nulls, stride=10000) + gdf_series = utils.gen_rand_series( + dtype, count, has_nulls=has_nulls, stride=10000 + ) pdf_series = gdf_series.to_pandas() gdf_series_result = getattr(gdf_series, func)( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, ) - pdf_series_result = getattr(pdf_series, func)(scalar, fill_value=fill_value) + pdf_series_result = getattr(pdf_series, func)( + scalar, fill_value=fill_value + ) utils.assert_eq(pdf_series_result, gdf_series_result) @@ -768,8 +794,12 @@ def test_operator_func_between_series_logical( pdf_series_a = gdf_series_a.to_pandas(nullable=True) pdf_series_b = gdf_series_b.to_pandas(nullable=True) - gdf_series_result = getattr(gdf_series_a, func)(gdf_series_b, fill_value=fill_value) - pdf_series_result = getattr(pdf_series_a, func)(pdf_series_b, fill_value=fill_value) + gdf_series_result = getattr(gdf_series_a, func)( + gdf_series_b, fill_value=fill_value + ) + pdf_series_result = getattr(pdf_series_a, func)( + pdf_series_b, fill_value=fill_value + ) expect = 
pdf_series_result got = gdf_series_result.to_pandas(nullable=True) @@ -815,7 +845,9 @@ def test_operator_func_series_and_scalar_logical( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, ) - pdf_series_result = getattr(pdf_series, func)(scalar, fill_value=fill_value) + pdf_series_result = getattr(pdf_series, func)( + scalar, fill_value=fill_value + ) expect = pdf_series_result got = gdf_series_result.to_pandas(nullable=True) @@ -841,7 +873,9 @@ def gen_df(): colname = ascii_lowercase[cols[i]] data = utils.gen_rand("float64", num_rows) * 10000 if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan pdf[colname] = data return pdf @@ -875,7 +909,9 @@ def gen_df(): colname = ascii_lowercase[cols[i]] data = utils.gen_rand("float64", num_rows) * 10000 if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan pdf[colname] = data return pdf @@ -914,10 +950,14 @@ def gen_df(): def test_binop_bool_uint(func, rhs): psr = pd.Series([True, False, False]) gsr = cudf.from_pandas(psr) - utils.assert_eq(getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False) + utils.assert_eq( + getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False + ) -@pytest.mark.parametrize("series_dtype", (np.int8, np.uint8, np.int64, np.uint64)) +@pytest.mark.parametrize( + "series_dtype", (np.int8, np.uint8, np.int64, np.uint64) +) @pytest.mark.parametrize( "divisor_dtype", ( @@ -968,7 +1008,9 @@ def test_floordiv_zero_bool(scalar_divisor): pytest.param( np.bool_, marks=pytest_xfail( - reason=("Pandas handling of division by zero-bool is too strange") + reason=( + "Pandas handling of division by zero-bool is too strange" + ) ), ), np.int8, @@ -1047,19 +1089,29 @@ def make_scalar_add_data(): ) # to any float, we may add any int, float, or bool - valid |= set(product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) + valid |= set( + product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) + ) # to any datetime, we may add any int, timedelta, or bool - valid |= set(product(DATETIME_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES)) + valid |= set( + product(DATETIME_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES) + ) # to any timedelta, we may add any int, datetime, other timedelta, or bool - valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | DATETIME_TYPES | BOOL_TYPES)) + valid |= set( + product(TIMEDELTA_TYPES, INTEGER_TYPES | DATETIME_TYPES | BOOL_TYPES) + ) # to any bool, we may add any int, float, datetime, timedelta, or bool valid |= set( product( BOOL_TYPES, - INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, + INTEGER_TYPES + | FLOAT_TYPES + | DATETIME_TYPES + | TIMEDELTA_TYPES + | BOOL_TYPES, ) ) @@ -1133,7 +1185,9 @@ def make_scalar_difference_data(): ) # from any float, we may subtract any int, float, or bool - valid |= set(product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) + valid |= set( + product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) + ) # from any datetime we may subtract any int, datetime, timedelta, or bool valid |= set( @@ -1144,10 +1198,14 @@ def make_scalar_difference_data(): ) # from any timedelta we may subtract any int, timedelta, or bool - valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | 
BOOL_TYPES)) + valid |= set( + product(TIMEDELTA_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES) + ) # from any bool we may subtract any int, float or timedelta - valid |= set(product(BOOL_TYPES, INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES)) + valid |= set( + product(BOOL_TYPES, INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES) + ) return sorted(list(valid)) @@ -1190,7 +1248,9 @@ def test_scalar_difference(dtype_l, dtype_r): assert expect.dtype == got.dtype -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_difference_data_invalid()) +@pytest.mark.parametrize( + "dtype_l,dtype_r", make_scalar_difference_data_invalid() +) def test_scalar_difference_invalid(dtype_l, dtype_r): test_value = 1 @@ -1216,7 +1276,9 @@ def make_scalar_product_data(): valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES)) # we can multiply a float by any int, float, or bool - valid |= set(product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) + valid |= set( + product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) + ) return sorted(list(valid)) @@ -1228,7 +1290,11 @@ def make_scalar_product_data_invalid(): # or bools by datetimes invalid |= set( product( - INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, + INTEGER_TYPES + | FLOAT_TYPES + | DATETIME_TYPES + | TIMEDELTA_TYPES + | BOOL_TYPES, DATETIME_TYPES, ) ) @@ -1237,7 +1303,11 @@ def make_scalar_product_data_invalid(): invalid |= set( product( DATETIME_TYPES, - INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, + INTEGER_TYPES + | FLOAT_TYPES + | DATETIME_TYPES + | TIMEDELTA_TYPES + | BOOL_TYPES, ) ) @@ -1312,7 +1382,11 @@ def make_scalar_floordiv_data_invalid(): invalid |= set( product( DATETIME_TYPES, - INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, + INTEGER_TYPES + | FLOAT_TYPES + | DATETIME_TYPES + | TIMEDELTA_TYPES + | BOOL_TYPES, ) ) @@ -1339,7 +1413,9 @@ def test_scalar_floordiv(dtype_l, dtype_r): assert expect.dtype == got.dtype -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_floordiv_data_invalid()) +@pytest.mark.parametrize( + "dtype_l,dtype_r", make_scalar_floordiv_data_invalid() +) def test_scalar_floordiv_invalid(dtype_l, dtype_r): test_value = 1 @@ -1384,12 +1460,18 @@ def make_scalar_truediv_data_invalid(): invalid |= set( product( DATETIME_TYPES, - INTEGER_TYPES | FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, + INTEGER_TYPES + | FLOAT_TYPES + | DATETIME_TYPES + | TIMEDELTA_TYPES + | BOOL_TYPES, ) ) # we cant true divide timedeltas by datetimes or bools or floats - invalid |= set(product(TIMEDELTA_TYPES, DATETIME_TYPES | BOOL_TYPES | FLOAT_TYPES)) + invalid |= set( + product(TIMEDELTA_TYPES, DATETIME_TYPES | BOOL_TYPES | FLOAT_TYPES) + ) return sorted(list(invalid)) @@ -1491,7 +1573,9 @@ def test_scalar_remainder(dtype_l, dtype_r): assert expect.dtype == got.dtype -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_remainder_data_invalid()) +@pytest.mark.parametrize( + "dtype_l,dtype_r", make_scalar_remainder_data_invalid() +) def test_scalar_remainder_invalid(dtype_l, dtype_r): test_value = 1 @@ -1518,7 +1602,11 @@ def make_scalar_power_data_invalid(): # datetimes and timedeltas cant go in exponents invalid |= set( product( - INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES | DATETIME_TYPES | BOOL_TYPES, + INTEGER_TYPES + | FLOAT_TYPES + | TIMEDELTA_TYPES + | DATETIME_TYPES + | BOOL_TYPES, DATETIME_TYPES | TIMEDELTA_TYPES, ) ) @@ -1528,7 +1616,11 @@ def make_scalar_power_data_invalid(): invalid |= 
set( product( DATETIME_TYPES | TIMEDELTA_TYPES, - DATETIME_TYPES | TIMEDELTA_TYPES | INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, + DATETIME_TYPES + | TIMEDELTA_TYPES + | INTEGER_TYPES + | FLOAT_TYPES + | BOOL_TYPES, ) ) @@ -1569,8 +1661,14 @@ def make_scalar_null_binops_data(): + [(operator.sub, *dtypes) for dtypes in make_scalar_difference_data()] + [(operator.mul, *dtypes) for dtypes in make_scalar_product_data()] + [(operator.add, *dtypes) for dtypes in make_scalar_add_data()] - + [(operator.floordiv, *dtypes) for dtypes in make_scalar_floordiv_data()] - + [(operator.truediv, *dtypes) for dtypes in make_scalar_truediv_data()] + + [ + (operator.floordiv, *dtypes) + for dtypes in make_scalar_floordiv_data() + ] + + [ + (operator.truediv, *dtypes) + for dtypes in make_scalar_truediv_data() + ] + [(operator.mod, *dtypes) for dtypes in make_scalar_remainder_data()] + [(operator.pow, *dtypes) for dtypes in make_scalar_power_data()] ) @@ -1616,7 +1714,9 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], ) @pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_datetime_dateoffset_binaryop(request, n_periods, frequency, dtype, op): +def test_datetime_dateoffset_binaryop( + request, n_periods, frequency, dtype, op +): request.applymarker( pytest.mark.xfail( PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION @@ -1675,7 +1775,9 @@ def test_datetime_dateoffset_binaryop(request, n_periods, frequency, dtype, op): @pytest.mark.filterwarnings( "ignore:Non-vectorized DateOffset:pandas.errors.PerformanceWarning" ) -@pytest.mark.filterwarnings("ignore:Discarding nonzero nanoseconds:UserWarning") +@pytest.mark.filterwarnings( + "ignore:Discarding nonzero nanoseconds:UserWarning" +) @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") @@ -2233,7 +2335,9 @@ def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): ), ], ) -def test_binops_reflect_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): +def test_binops_reflect_decimal( + op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype +): a = utils._decimal_series(lhs, l_dtype) b = utils._decimal_series(rhs, r_dtype) expect = utils._decimal_series(expect, expect_dtype) @@ -2914,7 +3018,9 @@ def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): @pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"]) def test_equality_ops_index_mismatch(fn): - a = cudf.Series([1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"]) + a = cudf.Series( + [1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"] + ) b = cudf.Series( [-5, 4, 3, 2, 1, 0, 19, 11], index=["aa", "b", "c", "d", "e", "f", "y", "z"], @@ -3046,7 +3152,9 @@ def test_empty_column(binop, data, scalar): "other", [ cudf.DataFrame([[9, 10], [11, 12], [13, 14], [15, 16]]), - cudf.DataFrame([[9.4, 10.5], [11.6, 12.7], [13.8, 14.9], [15.1, 16.2]]), + cudf.DataFrame( + [[9.4, 10.5], [11.6, 12.7], [13.8, 14.9], [15.1, 16.2]] + ), cudf.Series([5, 6, 7, 8]), cudf.Series([5.6, 6.7, 7.8, 8.9]), np.array([5, 6, 7, 8]), diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 372911ced54..ad32ebce01b 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -48,7 +48,9 @@ def test_categorical_basic(): assert_eq(pdsr.cat.categories, 
sr.cat.categories) assert pdsr.cat.ordered == sr.cat.ordered - np.testing.assert_array_equal(pdsr.cat.codes.values, sr.cat.codes.to_numpy()) + np.testing.assert_array_equal( + pdsr.cat.codes.values, sr.cat.codes.to_numpy() + ) string = str(sr) expect_str = """ @@ -189,7 +191,9 @@ def test_categorical_masking(): expect_matches = pdsr == "a" got_matches = sr == "a" - np.testing.assert_array_equal(expect_matches.values, got_matches.to_numpy()) + np.testing.assert_array_equal( + expect_matches.values, got_matches.to_numpy() + ) # mask series expect_masked = pdsr[expect_matches] @@ -253,7 +257,9 @@ def test_categorical_unique(num_elements): np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(string.ascii_letters + string.digits), num_elements), + np.random.choice( + list(string.ascii_letters + string.digits), num_elements + ), dtype="category", ) ) @@ -278,7 +284,9 @@ def test_categorical_unique_count(nelem): np.random.seed(12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(string.ascii_letters + string.digits), nelem), + np.random.choice( + list(string.ascii_letters + string.digits), nelem + ), dtype="category", ) ) @@ -307,7 +315,9 @@ def test_categorical_empty(): assert_eq(pdsr.cat.categories, sr.cat.categories) assert pdsr.cat.ordered == sr.cat.ordered - np.testing.assert_array_equal(pdsr.cat.codes.values, sr.cat.codes.to_numpy()) + np.testing.assert_array_equal( + pdsr.cat.codes.values, sr.cat.codes.to_numpy() + ) def test_categorical_set_categories(): @@ -638,7 +648,9 @@ def test_add_categories(data, add): with _hide_cudf_safe_casting_warning(): actual = gds.cat.add_categories(add) - assert_eq(expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype)) + assert_eq( + expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype) + ) # Need to type-cast pandas object to str due to mixed-type # support in "object" @@ -742,14 +754,16 @@ def test_categorical_allow_nan(): assert_eq(expected_categories, gs.cat.categories) actual_ps = gs.to_pandas() - expected_ps = pd.Series([1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category") + expected_ps = pd.Series( + [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" + ) assert_eq(actual_ps, expected_ps) def test_categorical_setitem_with_nan(): - gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( - "category" - ) + gs = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") gs[[1, 3]] = np.nan expected_series = cudf.Series( @@ -762,7 +776,9 @@ def test_categorical_setitem_with_nan(): @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): dtype = cudf.dtype(dtype) - input_obj = [dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj] + input_obj = [ + dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj + ] expect = pd.Series(input_obj, dtype="category") got = cudf.Series(input_obj, dtype="category").to_pandas() diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 9f9612f4933..8e8555b2005 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -88,7 +88,8 @@ def test_column_offset_and_size(pandas_input, offset, size): if col.size > 0: assert col.size == (col.children[0].size - 1) assert col.size == ( - (col.children[0].data.size / col.children[0].dtype.itemsize) - 1 + (col.children[0].data.size / col.children[0].dtype.itemsize) + - 1 ) else: assert col.size 
== (col.data.size / col.dtype.itemsize) @@ -199,7 +200,11 @@ def test_column_mixed_dtype(data, error): ) @pytest.mark.parametrize("size", [1, 10]) def test_as_column_scalar_with_nan(nan_as_null, scalar, size): - expected = cudf.Series([scalar] * size, nan_as_null=nan_as_null).dropna().to_numpy() + expected = ( + cudf.Series([scalar] * size, nan_as_null=nan_as_null) + .dropna() + .to_numpy() + ) got = ( cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) @@ -330,7 +335,8 @@ def test_column_view_valid_string_to_numeric(data, to_dtype): def test_column_view_nulls_widths_even(): data = [1, 2, None, 4, None] expect_data = [ - np.int32(val).view("float32") if val is not None else np.nan for val in data + np.int32(val).view("float32") if val is not None else np.nan + for val in data ] sr = cudf.Series(data, dtype="int32") @@ -341,7 +347,8 @@ def test_column_view_nulls_widths_even(): data = [None, 2.1, None, 5.3, 8.8] expect_data = [ - np.float64(val).view("int64") if val is not None else val for val in data + np.float64(val).view("int64") if val is not None else val + for val in data ] sr = cudf.Series(data, dtype="float64") @@ -362,7 +369,9 @@ def test_column_view_numeric_slice(slc): assert_eq(expect, got) -@pytest.mark.parametrize("slc", [slice(3, 5), slice(0, 4), slice(2, 5), slice(1, 3)]) +@pytest.mark.parametrize( + "slc", [slice(3, 5), slice(0, 4), slice(2, 5), slice(1, 3)] +) def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] @@ -498,7 +507,9 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools(gd_data._column.base_mask, 0, len(gd_data)).values_host + got_mask = mask_to_bools( + gd_data._column.base_mask, 0, len(gd_data) + ).values_host np.testing.assert_array_equal(expect_mask, got_mask) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 32425c78d26..a8eac2edf2b 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -76,7 +76,9 @@ def test_to_pandas_multiindex_names(): ) assert_eq( ca.to_pandas_index(), - pd.MultiIndex.from_tuples((("a", "b"), ("c", "d")), names=("foo", "bar")), + pd.MultiIndex.from_tuples( + (("a", "b"), ("c", "d")), names=("foo", "bar") + ), ) @@ -253,7 +255,9 @@ def test_select_by_index_empty(): }, multiindex=True, ) - expect = ColumnAccessor({}, multiindex=True, level_names=((None, None, None))) + expect = ColumnAccessor( + {}, multiindex=True, level_names=((None, None, None)) + ) got = ca.select_by_index(slice(None, 0)) check_ca_equal(expect, got) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3c04f62d29f..cdb47ea79d8 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -25,7 +25,8 @@ def _hide_concat_empty_dtype_warning(): # being caught and validated in other tests. 
warnings.filterwarnings( "ignore", - "The behavior of array concatenation with empty entries " "is deprecated.", + "The behavior of array concatenation with empty entries " + "is deprecated.", category=FutureWarning, ) yield @@ -206,7 +207,9 @@ def test_concat_misordered_columns(): @pytest.mark.parametrize("axis", [1, "columns"]) def test_concat_columns(axis): pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3]) - pdf2 = pd.DataFrame(np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7]) + pdf2 = pd.DataFrame( + np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7] + ) gdf1 = cudf.from_pandas(pdf1) gdf2 = cudf.from_pandas(pdf2) @@ -263,7 +266,9 @@ def test_concat_multiindex_series(): pd.concat([pdg1, pdg2]), check_index_type=True, ) - assert_eq(cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) + assert_eq( + cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1) + ) def test_concat_multiindex_dataframe_and_series(): @@ -324,11 +329,21 @@ def test_concat_string_index_name(myindex): def test_pandas_concat_compatibility_axis1(): - d1 = cudf.datasets.randomdata(3, dtypes={"a": float, "ind": float}).set_index("ind") - d2 = cudf.datasets.randomdata(3, dtypes={"b": float, "ind": float}).set_index("ind") - d3 = cudf.datasets.randomdata(3, dtypes={"c": float, "ind": float}).set_index("ind") - d4 = cudf.datasets.randomdata(3, dtypes={"d": float, "ind": float}).set_index("ind") - d5 = cudf.datasets.randomdata(3, dtypes={"e": float, "ind": float}).set_index("ind") + d1 = cudf.datasets.randomdata( + 3, dtypes={"a": float, "ind": float} + ).set_index("ind") + d2 = cudf.datasets.randomdata( + 3, dtypes={"b": float, "ind": float} + ).set_index("ind") + d3 = cudf.datasets.randomdata( + 3, dtypes={"c": float, "ind": float} + ).set_index("ind") + d4 = cudf.datasets.randomdata( + 3, dtypes={"d": float, "ind": float} + ).set_index("ind") + d5 = cudf.datasets.randomdata( + 3, dtypes={"e": float, "ind": float} + ).set_index("ind") pd1 = d1.to_pandas() pd2 = d2.to_pandas() @@ -447,11 +462,15 @@ def test_concat_mixed_input(): pd.DataFrame({"a": [1, 2]}), ], [ - pd.Series([1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130]), + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] + ), pd.DataFrame({"a": [1, 2]}), ], [ - pd.Series([1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"]), + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + ), pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), ], [ @@ -539,7 +558,9 @@ def test_concat_series_dataframe_input_str(objs): [ pd.DataFrame(), pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), @@ -553,7 +574,9 @@ def test_concat_series_dataframe_input_str(objs): [ [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], [ - pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), pd.DataFrame(), pd.DataFrame(), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), @@ -562,12 +585,16 @@ def test_concat_series_dataframe_input_str(objs): pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame({"cat": 
pd.Series(["two", "three"], dtype="category")}), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), ], [ pd.DataFrame([]), pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), ], ], ) @@ -688,7 +715,9 @@ def test_concat_dataframe_with_multiindex(df1, df2): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ), - pd.DataFrame({"x": range(10, 20), "y": list(map(float, range(10, 20)))}), + pd.DataFrame( + {"x": range(10, 20), "y": list(map(float, range(10, 20)))} + ), ], [ pd.DataFrame( @@ -726,7 +755,9 @@ def test_concat_join(objs, ignore_index, sort, join, axis): gpu_objs = [cudf.from_pandas(o) for o in objs] assert_eq( - pd.concat(objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis), + pd.concat( + objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis + ), cudf.concat( gpu_objs, sort=sort, @@ -749,7 +780,9 @@ def test_concat_join(objs, ignore_index, sort, join, axis): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ), - pd.DataFrame({"x": range(10, 20), "y": list(map(float, range(10, 20)))}), + pd.DataFrame( + {"x": range(10, 20), "y": list(map(float, range(10, 20)))} + ), ], ], ) @@ -780,7 +813,9 @@ def test_concat_join_axis_1_dup_error(objs): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ), - pd.DataFrame({"l": range(10, 20), "m": list(map(float, range(10, 20)))}), + pd.DataFrame( + {"l": range(10, 20), "m": list(map(float, range(10, 20)))} + ), ], ], ) @@ -818,7 +853,9 @@ def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], } ) - pdf2 = pd.DataFrame({"l": range(10, 20), "m": list(map(float, range(10, 20)))}) + pdf2 = pd.DataFrame( + {"l": range(10, 20), "m": list(map(float, range(10, 20)))} + ) pdf3 = pd.DataFrame({"j": [1, 2], "k": [1, 2], "s": [1, 2], "t": [1, 2]}) pdf_empty1 = pd.DataFrame() @@ -879,8 +916,12 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), ), ( - pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"]), - pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"]), + pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"] + ), + pd.DataFrame( + {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"] + ), ), ], ) @@ -888,7 +929,9 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_no_overlapping_columns(pdf1, pdf2, ignore_index, sort, join, axis): +def test_concat_join_no_overlapping_columns( + pdf1, pdf2, ignore_index, sort, join, axis +): gdf1 = cudf.from_pandas(pdf1) gdf2 = cudf.from_pandas(pdf2) @@ -959,8 +1002,12 @@ def test_concat_join_no_overlapping_columns_many_and_empty( "objs", [ [ - pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"]), - pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"]), + pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"] + ), + pd.DataFrame( + {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"] + ), pd.DataFrame( { "x": range(10), @@ -1105,7 +1152,9 @@ def test_concat_join_series(ignore_index, sort, join, axis): [ pd.DataFrame(), pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 
44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), @@ -1119,7 +1168,9 @@ def test_concat_join_series(ignore_index, sort, join, axis): [ [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], [ - pd.DataFrame({"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), + pd.DataFrame( + {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), pd.DataFrame(), pd.DataFrame(), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), @@ -1128,12 +1179,16 @@ def test_concat_join_series(ignore_index, sort, join, axis): pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), ], [ pd.DataFrame([]), pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["two", "three"], dtype="category")}), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), ], ], ) @@ -1141,7 +1196,9 @@ def test_concat_join_series(ignore_index, sort, join, axis): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0]) -def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort): +def test_concat_join_empty_dataframes( + df, other, ignore_index, axis, join, sort +): other_pd = [df] + other gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1159,7 +1216,9 @@ def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort) if not _is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: # https://github.com/pandas-dev/pandas/issues/42840 - expected[key] = expected[key].fillna("-1").astype("str") + expected[key] = ( + expected[key].fillna("-1").astype("str") + ) else: expected[key] = ( expected[key] @@ -1176,7 +1235,9 @@ def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort) expected.fillna(-1), actual.fillna(-1), check_dtype=False, - check_index_type=False if len(expected) == 0 or actual.empty else True, + check_index_type=False + if len(expected) == 0 or actual.empty + else True, check_column_type=False, ) else: @@ -1201,7 +1262,9 @@ def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort) [ pd.DataFrame(), pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"l": [10]}), @@ -1215,7 +1278,9 @@ def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort) [ [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], [ - pd.DataFrame({"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20]), + pd.DataFrame( + {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), pd.DataFrame(), pd.DataFrame(), pd.DataFrame([[5, 6], [7, 8]], columns=list("CD")), @@ -1224,12 +1289,16 @@ def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort) pd.DataFrame({"g": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), pd.DataFrame({"h": [10]}), pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame({"dog": 
pd.Series(["two", "three"], dtype="category")}), + pd.DataFrame( + {"dog": pd.Series(["two", "three"], dtype="category")} + ), ], [ pd.DataFrame([]), pd.DataFrame([], index=[100]), - pd.DataFrame({"bird": pd.Series(["two", "three"], dtype="category")}), + pd.DataFrame( + {"bird": pd.Series(["two", "three"], dtype="category")} + ), ], ], ) @@ -1237,7 +1306,9 @@ def test_concat_join_empty_dataframes(df, other, ignore_index, axis, join, sort) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [1]) -def test_concat_join_empty_dataframes_axis_1(df, other, ignore_index, axis, join, sort): +def test_concat_join_empty_dataframes_axis_1( + df, other, ignore_index, axis, join, sort +): # no duplicate columns other_pd = [df] + other gdf = cudf.from_pandas(df) @@ -1269,7 +1340,9 @@ def test_concat_join_empty_dataframes_axis_1(df, other, ignore_index, axis, join expected.fillna(-1), actual.fillna(-1), check_dtype=False, - check_index_type=False if len(expected) == 0 or actual.empty else True, + check_index_type=False + if len(expected) == 0 or actual.empty + else True, check_column_type=False, ) else: @@ -1280,7 +1353,9 @@ def test_concat_join_empty_dataframes_axis_1(df, other, ignore_index, axis, join check_index_type=False, check_column_type=False, ) - assert_eq(expected, actual, check_index_type=False, check_column_type=False) + assert_eq( + expected, actual, check_index_type=False, check_column_type=False + ) def test_concat_preserve_order(): @@ -1513,7 +1588,9 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ), ), ( - cudf.Series([Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2)), + cudf.Series( + [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) + ), cudf.Series([33, 984], dtype="uint32"), cudf.Series([593, -702], dtype="int32"), cudf.Series( @@ -1626,7 +1703,9 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): dtype=Decimal64Dtype(5, 2), ), cudf.Series( - np.arange("2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]"), + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ), dtype="datetime64[s]", ), cudf.Series( @@ -1656,7 +1735,9 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): [Decimal("753.0"), Decimal("94.22")], dtype=Decimal64Dtype(5, 2), ), - cudf.Series([np.timedelta64(940252, "s"), np.timedelta64(758385, "s")]), + cudf.Series( + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] + ), cudf.Series( ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], index=[0, 1, 0, 1], @@ -1736,7 +1817,9 @@ def test_concat_list_column(frame1, frame2, expected): def test_concat_categorical_ordering(): # https://github.com/rapidsai/cudf/issues/11486 - sr = pd.Series(["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category") + sr = pd.Series( + ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category" + ) sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) df = pd.DataFrame({"a": sr}) @@ -1775,9 +1858,13 @@ def singleton_concat_obj(request, singleton_concat_index): @pytest.mark.parametrize("axis", [0, 1, "columns", "index"]) @pytest.mark.parametrize("sort", [False, True]) @pytest.mark.parametrize("ignore_index", [False, True]) -def test_concat_singleton_sorting(axis, sort, ignore_index, singleton_concat_obj): +def test_concat_singleton_sorting( + axis, sort, ignore_index, singleton_concat_obj +): gobj = cudf.from_pandas(singleton_concat_obj) - gconcat = cudf.concat([gobj], 
axis=axis, sort=sort, ignore_index=ignore_index) + gconcat = cudf.concat( + [gobj], axis=axis, sort=sort, ignore_index=ignore_index + ) pconcat = pd.concat( [singleton_concat_obj], axis=axis, sort=sort, ignore_index=ignore_index ) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 7b4caf41559..15dfa111860 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import datetime @@ -27,7 +27,9 @@ def cudf_num_series(start, stop, step=1): def get_categorical_series(): return Series( - pd.Categorical(["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"]) + pd.Categorical( + ["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"] + ) ) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index cb99f576a79..e737a73e86b 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -113,7 +113,9 @@ def test_series_setitem_partial_slice_cow_on(): assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] - assert new_slice._column.base_data.owner == actual._column.base_data.owner + assert ( + new_slice._column.base_data.owner == actual._column.base_data.owner + ) new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) @@ -398,21 +400,29 @@ def test_series_cat_copy(copy_on_write): def test_dataframe_cow_slice_setitem(): with cudf.option_context("copy_on_write", True): - df = cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}) + df = cudf.DataFrame( + {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} + ) slice_df = df[1:4] assert_eq( slice_df, - cudf.DataFrame({"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3]), + cudf.DataFrame( + {"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3] + ), ) slice_df["a"][2] = 1111 assert_eq( slice_df, - cudf.DataFrame({"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3]), + cudf.DataFrame( + {"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3] + ), ) assert_eq( df, - cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}), + cudf.DataFrame( + {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} + ), ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 70ff69f5a61..2d728fb94ba 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -71,7 +71,9 @@ def make_datetime_dataframe(include_non_standard=False): def make_numpy_mixed_dataframe(): df = pd.DataFrame() df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Date"] = np.array(["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"]) + df["Date"] = np.array( + ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] + ) df["Float"] = np.array([9.001, 8.343, 6, 2.781]) df["Integer2"] = np.array([2345, 106, 2088, 789277]) df["Category"] = np.array(["M", "F", "F", "F"]) @@ -549,7 +551,9 @@ def test_csv_reader_NaN_values(): custom_na_values = ["NV_NAN", "NotANumber"] # test default NA values. 
empty cells should also yield NaNs - gdf = read_csv(StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes) + gdf = read_csv( + StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes + ) pdf = pd.read_csv( StringIO(default_na_cells + empty_cells), names=names, dtype=np.float32 ) @@ -627,7 +631,9 @@ def test_csv_reader_thousands(tmpdir): uint32_ref = [1234567, 12345] uint64_ref = [1234567890, 123456789] - df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'") + df = read_csv( + str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" + ) np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) @@ -653,7 +659,9 @@ def test_csv_reader_buffer_strings(): assert df["text"][2] == "c" assert df["text"][3] == "d" - df2 = read_csv(BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1) + df2 = read_csv( + BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1 + ) assert len(df2.columns) == 2 assert df2["text"].dtype == np.dtype("object") assert df2["int"].dtype == np.dtype("int64") @@ -676,14 +684,18 @@ def test_csv_reader_buffer_strings(): ("", None, None), ], ) -def test_csv_reader_compression(tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe): +def test_csv_reader_compression( + tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe +): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_compression" + ext) df = pd_mixed_dataframe df.to_csv(fname, index=False, header=False, compression=out_comp) gdf = read_csv(fname, names=list(df.columns.values), compression=in_comp) - pdf = pd.read_csv(fname, names=list(df.columns.values), compression=in_comp) + pdf = pd.read_csv( + fname, names=list(df.columns.values), compression=in_comp + ) assert_eq(gdf, pdf) @@ -880,7 +892,9 @@ def test_csv_reader_nrows(tmpdir): assert df["int2"][read_rows - 1] == 2 * (read_rows - 1 + skip_rows) # with column name inference - df = read_csv(str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows) + df = read_csv( + str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows + ) assert df.shape == (read_rows, 2) assert str(skip_rows) in list(df)[0] assert str(2 * skip_rows) in list(df)[1] @@ -898,7 +912,9 @@ def test_csv_reader_nrows(tmpdir): assert df["int2"][rows - 1] == 2 * (rows - 1) # nrows + skiprows larger than the file - df = read_csv(str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows) + df = read_csv( + str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows + ) assert df.shape == (rows - read_rows, 2) # nrows equal to zero @@ -963,7 +979,9 @@ def test_csv_reader_skiprows_header(skip_rows, header_row): cu_df = read_csv( StringIO(buffer), dtype=dtypes, skiprows=skip_rows, header=header_row ) - pd_df = pd.read_csv(StringIO(buffer), skiprows=skip_rows, header=header_row) + pd_df = pd.read_csv( + StringIO(buffer), skiprows=skip_rows, header=header_row + ) assert cu_df.shape == pd_df.shape assert list(cu_df.columns.values) == list(pd_df.columns.values) @@ -1035,7 +1053,9 @@ def test_csv_reader_filenotfound(tmpdir): read_csv(str(dname)) -@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "string_io", "url"]) +@pytest.mark.parametrize( + "src", ["filepath", "pathobj", "bytes_io", "string_io", "url"] +) def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src): expect = pd.read_csv(path_or_buf("filepath")) got = cudf.read_csv(path_or_buf(src)) @@ -1253,7 +1273,9 @@ def test_csv_reader_delim_whitespace(): with 
pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None) with pytest.warns(FutureWarning): - pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True, header=None) + pd_df = pd.read_csv( + StringIO(buffer), delim_whitespace=True, header=None + ) assert pd_df.shape == cu_df.shape # should raise an error if used with delimiter or sep @@ -1316,7 +1338,9 @@ def test_csv_reader_index_col(): # using a column index with names cu_df = read_csv(StringIO(buffer), header=None, index_col=0, names=names) - pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=0, names=names) + pd_df = pd.read_csv( + StringIO(buffer), header=None, index_col=0, names=names + ) assert_eq(cu_df.index, pd_df.index) # passing False to avoid using a column as index (no-op in cuDF) @@ -1328,7 +1352,9 @@ def test_csv_reader_index_col(): @pytest.mark.parametrize("index_name", [None, "custom name", 124]) @pytest.mark.parametrize("index_col", [None, 0, "a"]) def test_csv_reader_index_names(index_name, index_col): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"]) + pdf = pd.DataFrame( + {"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"] + ) pdf.index.name = index_name buffer = pdf.to_csv() @@ -1337,7 +1363,9 @@ def test_csv_reader_index_names(index_name, index_col): assert_eq(actual, expected) -@pytest.mark.parametrize("names", [["a", "b", "c"], [416, 905, 647], range(3), None]) +@pytest.mark.parametrize( + "names", [["a", "b", "c"], [416, 905, 647], range(3), None] +) def test_csv_reader_column_names(names): buffer = "0,1,2\n3,4,5\n6,7,8" @@ -1375,7 +1403,9 @@ def test_csv_reader_aligned_byte_range(tmpdir): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv") nelem = 1000 - input_df = pd.DataFrame({"key": np.arange(0, nelem), "zeros": np.zeros(nelem)}) + input_df = pd.DataFrame( + {"key": np.arange(0, nelem), "zeros": np.zeros(nelem)} + ) input_df.to_csv(fname) df = cudf.read_csv(str(fname), byte_range=(0, 4096)) @@ -1397,7 +1427,9 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): # require explicit `hex` dtype to parse hexadecimals pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - np.testing.assert_array_equal(pdf["hex_int"], gdf["hex_int"].to_numpy()) + np.testing.assert_array_equal( + pdf["hex_int"], gdf["hex_int"].to_numpy() + ) else: # otherwise, dtype inference returns as object (string) pdf = pd.read_csv(StringIO(buffer), names=["hex_int"]) @@ -1442,7 +1474,9 @@ def test_csv_reader_pd_consistent_quotes(quoting): buffer = "\n".join(lines) - gd_df = read_csv(StringIO(buffer), names=names, dtype=dtypes, quoting=quoting) + gd_df = read_csv( + StringIO(buffer), names=names, dtype=dtypes, quoting=quoting + ) pd_df = pd.read_csv(StringIO(buffer), names=names, quoting=quoting) assert_eq(pd_df, gd_df) @@ -1656,7 +1690,9 @@ def test_csv_writer_terminator_sep(lineterminator, sep, cudf_mixed_dataframe): assert_eq(df, got) -@pytest.mark.parametrize("lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")]) +@pytest.mark.parametrize( + "lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")] +) def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): df = cudf_mixed_dataframe @@ -1682,8 +1718,12 @@ def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): None, ], ) -@pytest.mark.parametrize("header", [True, False, np.bool_(True), np.bool_(False)]) -@pytest.mark.parametrize("index", 
[True, False, np.bool_(True), np.bool_(False)]) +@pytest.mark.parametrize( + "header", [True, False, np.bool_(True), np.bool_(False)] +) +@pytest.mark.parametrize( + "index", [True, False, np.bool_(True), np.bool_(False)] +) def test_csv_writer_column_and_header_options( columns, header, index, pd_mixed_dataframe ): @@ -1753,8 +1793,12 @@ def test_csv_writer_chunksize(chunksize, dtype): "df", [ cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame({"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}), - cudf.DataFrame({"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}), + cudf.DataFrame( + {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} + ), + cudf.DataFrame( + {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} + ), ], ) def test_to_csv_empty_filename(df): @@ -1770,8 +1814,12 @@ def test_to_csv_empty_filename(df): "df", [ cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame({"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}), - cudf.DataFrame({"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}), + cudf.DataFrame( + {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} + ), + cudf.DataFrame( + {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} + ), ], ) def test_to_csv_StringIO(df): @@ -1885,7 +1933,9 @@ def test_csv_write_empty_dataframe(df, index): pd.DataFrame(columns=[""]), ], ) -@pytest.mark.parametrize("na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"]) +@pytest.mark.parametrize( + "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"] +) def test_csv_write_dataframe_na_rep(df, na_rep): gdf = cudf.from_pandas(df) @@ -1930,7 +1980,9 @@ def test_csv_reader_nullable_dtypes(dtype): assert_eq(expected, actual.to_pandas(nullable=True)) -@pytest.mark.parametrize("dtype", sorted(list(cudf.utils.dtypes.TIMEDELTA_TYPES))) +@pytest.mark.parametrize( + "dtype", sorted(list(cudf.utils.dtypes.TIMEDELTA_TYPES)) +) def test_csv_reader_timedetla_dtypes(dtype): buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" @@ -1940,7 +1992,9 @@ def test_csv_reader_timedetla_dtypes(dtype): assert_eq(expected, actual) -@pytest.mark.parametrize("dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES))) +@pytest.mark.parametrize( + "dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES)) +) def test_csv_reader_datetime_dtypes(dtype): buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" @@ -1962,7 +2016,9 @@ def test_csv_reader_datetime_dtypes(dtype): cudf.DataFrame( { "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), - "b": cudf.Series([None, "c", None, "b", "a"], dtype="category"), + "b": cudf.Series( + [None, "c", None, "b", "a"], dtype="category" + ), } ), cudf.DataFrame( @@ -1970,7 +2026,9 @@ def test_csv_reader_datetime_dtypes(dtype): "b": cudf.Series( [1.1, 2, 3, 1.1, 2], dtype="category", - index=cudf.CategoricalIndex(["abc", "def", "ghi", "jkl", "xyz"]), + index=cudf.CategoricalIndex( + ["abc", "def", "ghi", "jkl", "xyz"] + ), ) } ), @@ -2031,8 +2089,12 @@ def test_na_filter_empty_fields(): gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False) assert_eq(pdf, gdf) - pdf = pd.read_csv(StringIO(buffer), keep_default_na=False, na_values=test_na) - gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False, na_values=test_na) + pdf = pd.read_csv( + StringIO(buffer), keep_default_na=False, na_values=test_na + ) + gdf = cudf.read_csv( + StringIO(buffer), keep_default_na=False, na_values=test_na + ) assert_eq(pdf, gdf) @@ -2085,7 +2147,9 @@ def test_empty_df_no_index(): 
assert_eq(actual, result) -def test_default_integer_bitwidth(cudf_mixed_dataframe, default_integer_bitwidth): +def test_default_integer_bitwidth( + cudf_mixed_dataframe, default_integer_bitwidth +): # Test that integer columns in csv are _inferred_ as user specified # bitwidth buf = BytesIO() @@ -2093,7 +2157,9 @@ def test_default_integer_bitwidth(cudf_mixed_dataframe, default_integer_bitwidth buf.seek(0) read = cudf.read_csv(buf) assert read["Integer"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["Integer2"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert read["Integer2"].dtype == np.dtype( + f"i{default_integer_bitwidth//8}" + ) def test_default_integer_bitwidth_partial( @@ -2106,7 +2172,9 @@ def test_default_integer_bitwidth_partial( buf.seek(0) read = cudf.read_csv(buf, dtype={"Integer": "int64"}) assert read["Integer"].dtype == np.dtype("i8") - assert read["Integer2"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert read["Integer2"].dtype == np.dtype( + f"i{default_integer_bitwidth//8}" + ) @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index c5d032a98b1..7fdf9754534 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """ Test method that apply GPU kernel to a frame. @@ -141,7 +141,10 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): expect_out1 = extra2 * in1 - extra1 * in2 + in3 expect_out2 = np.hstack( - [tpb * np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)])] + [ + tpb * np.arange(e - s) + for s, e in zip(chunks, chunks[1:] + [len(df)]) + ] ) outdf = df.apply_chunks( diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 550cffe7a9e..213c6c2c1f9 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -79,7 +79,8 @@ def test_cuda_array_interface_interop_out_masked(dtype, module): expectation = does_not_raise() if module == "cupy": pytest.skip( - "cupy doesn't support version 1 of " "`__cuda_array_interface__` yet" + "cupy doesn't support version 1 of " + "`__cuda_array_interface__` yet" ) module_constructor = cupy.asarray @@ -128,13 +129,17 @@ def test_cuda_array_interface_as_column(dtype, nulls, mask_type): sr = sr.astype(dtype) - obj = types.SimpleNamespace(__cuda_array_interface__=sr.__cuda_array_interface__) + obj = types.SimpleNamespace( + __cuda_array_interface__=sr.__cuda_array_interface__ + ) if mask_type == "bools": if nulls == "some": obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask) elif nulls == "all": - obj.__cuda_array_interface__["mask"] = numba.cuda.to_device([False] * 10) + obj.__cuda_array_interface__["mask"] = numba.cuda.to_device( + [False] * 10 + ) expect = sr got = cudf.Series(obj) diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py index 84e163d400a..24c1eaa8f02 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -18,7 +18,9 @@ @pytest.mark.parametrize("bins", [1, 2, 3]) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True]) # if ordered is False we need labels 
+@pytest.mark.parametrize( + "ordered", [True] +) # if ordered is False we need labels @pytest.mark.parametrize("precision", [1, 2, 3]) def test_cut_basic(x, bins, right, include_lowest, ordered, precision): # will test optional labels, retbins and duplicates separately @@ -55,7 +57,9 @@ def test_cut_basic(x, bins, right, include_lowest, ordered, precision): @pytest.mark.parametrize( "labels", [["bad", "medium", "good"], ["A", "B", "C"], [1, 2, 3], False] ) -def test_cut_labels(x, bins, right, include_lowest, ordered, precision, labels): +def test_cut_labels( + x, bins, right, include_lowest, ordered, precision, labels +): pcat = pd.cut( x=x, bins=bins, @@ -83,7 +87,9 @@ def test_cut_labels(x, bins, right, include_lowest, ordered, precision, labels): @pytest.mark.parametrize("bins", [3]) # labels must be the same len as bins @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [False]) # labels must be unique if ordered=True +@pytest.mark.parametrize( + "ordered", [False] +) # labels must be unique if ordered=True @pytest.mark.parametrize("precision", [1, 2, 3]) @pytest.mark.parametrize( "labels", [["bad", "good", "good"], ["B", "A", "B"], [1, 2, 2], False] diff --git a/python/cudf/cudf/tests/test_dask.py b/python/cudf/cudf/tests/test_dask.py index 6048ccb5327..3af21b4a7ff 100644 --- a/python/cudf/cudf/tests/test_dask.py +++ b/python/cudf/cudf/tests/test_dask.py @@ -1,10 +1,12 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019, NVIDIA CORPORATION. import pytest import cudf -is_dataframe_like = pytest.importorskip("dask.dataframe.utils").is_dataframe_like +is_dataframe_like = pytest.importorskip( + "dask.dataframe.utils" +).is_dataframe_like is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 48c3fd911a6..ead1ab2da6c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -94,7 +94,8 @@ def _hide_concat_empty_dtype_warning(): # being caught and validated in other tests. 
warnings.filterwarnings( "ignore", - "The behavior of array concatenation with empty " "entries is deprecated.", + "The behavior of array concatenation with empty " + "entries is deprecated.", category=FutureWarning, ) yield @@ -272,8 +273,12 @@ def test_init_unaligned_with_index(): def test_init_series_list_columns_unsort(): - pseries = [pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)] - gseries = [cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)] + pseries = [ + pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + gseries = [ + cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] pdf = pd.DataFrame(pseries) gdf = cudf.DataFrame(gseries) assert_eq(pdf, gdf) @@ -388,11 +393,17 @@ def test_dataframe_truncate_axis_1(): def test_dataframe_truncate_datetimeindex(): - dates = cudf.date_range("2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s") + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" + ) df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) pdf = df.to_pandas() - expected = pdf.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27") - actual = df.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27") + expected = pdf.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + actual = df.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) assert_eq(actual, expected) @@ -478,7 +489,9 @@ def test_dataframe_basic(): name="custom_name", ), ), - pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), ], ) @pytest.mark.parametrize( @@ -507,7 +520,9 @@ def test_dataframe_drop_columns(pdf, columns, inplace): {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, index=pd.Index(list(range(10)), name="custom_name"), ), - pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), ], ) @pytest.mark.parametrize( @@ -541,7 +556,9 @@ def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): "pdf", [ pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), pd.DataFrame( { "a": range(10), @@ -627,7 +644,9 @@ def test_dataframe_drop_multiindex(pdf, index, level, inplace): "pdf", [ pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame({"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), ], ) @pytest.mark.parametrize( @@ -719,7 +738,9 @@ def test_dataframe_swaplevel_axis_0(): def test_dataframe_swaplevel_TypeError(): - cdf = cudf.DataFrame({"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"]) + cdf = cudf.DataFrame( + {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] + ) with pytest.raises(TypeError): cdf.swaplevel() @@ -750,7 +771,9 @@ def test_dataframe_swaplevel_axis_1(): def test_dataframe_drop_raises(): - df = cudf.DataFrame({"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"]) + df = cudf.DataFrame( + {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] + ) pdf = df.to_pandas() assert_exceptions_equal( lfunc=pdf.drop, @@ -863,7 +886,9 @@ def test_dataframe_index_rename(axis): def test_dataframe_MI_rename(): - gdf = 
cudf.DataFrame({"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}) + gdf = cudf.DataFrame( + {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} + ) gdg = gdf.groupby(["a", "b"]).count() pdg = gdg.to_pandas() @@ -896,7 +921,9 @@ def test_dataframe_column_rename(axis): def test_dataframe_pop(): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]}) + pdf = pd.DataFrame( + {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} + ) gdf = cudf.DataFrame.from_pandas(pdf) # Test non-existing column error @@ -972,7 +999,9 @@ def test_index_astype(nelem): def test_dataframe_to_string_with_skipped_rows(): # Test skipped rows - df = cudf.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) with pd.option_context("display.max_rows", 5): got = df.to_string() @@ -1021,7 +1050,9 @@ def test_dataframe_to_string_with_skipped_rows_and_columns(): def test_dataframe_to_string_with_masked_data(): # Test masked data - df = cudf.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]}) + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) data = np.arange(6) mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) @@ -1136,7 +1167,9 @@ def test_dataframe_copy_shallow(): def test_dataframe_dtypes(): - dtypes = pd.Series([np.int32, np.float32, np.float64], index=["c", "a", "b"]) + dtypes = pd.Series( + [np.int32, np.float32, np.float64], index=["c", "a", "b"] + ) df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) assert df.dtypes.equals(dtypes) @@ -1287,10 +1320,14 @@ def test_dataframe_setitem_from_masked_object(): test1_nan = cudf.Series(ary, nan_as_null=False) assert test1_nan.null_count == 0 - test2_null = cudf.DataFrame.from_pandas(pd.DataFrame({"a": ary}), nan_as_null=True) + test2_null = cudf.DataFrame.from_pandas( + pd.DataFrame({"a": ary}), nan_as_null=True + ) assert test2_null["a"].nullable assert test2_null["a"].null_count == 20 - test2_nan = cudf.DataFrame.from_pandas(pd.DataFrame({"a": ary}), nan_as_null=False) + test2_nan = cudf.DataFrame.from_pandas( + pd.DataFrame({"a": ary}), nan_as_null=False + ) assert test2_nan["a"].null_count == 0 gpu_ary = cupy.asarray(ary) @@ -1930,7 +1967,9 @@ def test_from_arrow(nelem, data_type): "b": np.random.randint(0, 1000, nelem).astype(data_type), } ) - padf = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None) + padf = pa.Table.from_pandas( + df, preserve_index=False + ).replace_schema_metadata(None) gdf = cudf.DataFrame.from_arrow(padf) assert isinstance(gdf, cudf.DataFrame) @@ -1956,7 +1995,9 @@ def test_to_arrow(nelem, data_type): ) gdf = cudf.DataFrame.from_pandas(df) - pa_df = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None) + pa_df = pa.Table.from_pandas( + df, preserve_index=False + ).replace_schema_metadata(None) pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) @@ -2015,7 +2056,9 @@ def test_to_arrow_categorical(): df["a"] = pd.Series(["a", "b", "c"], dtype="category") gdf = cudf.DataFrame.from_pandas(df) - pa_df = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None) + pa_df = pa.Table.from_pandas( + df, preserve_index=False + ).replace_schema_metadata(None) pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) assert isinstance(pa_gdf, pa.Table) @@ -2053,7 +2096,9 @@ def test_to_arrow_missing_categorical(): def 
test_from_scalar_typing(data_type): if data_type == "datetime64[ms]": scalar = ( - np.dtype("int64").type(np.random.randint(0, 5)).astype("datetime64[ms]") + np.dtype("int64") + .type(np.random.randint(0, 5)) + .astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") @@ -2162,7 +2207,9 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): dtype=dtype, ) if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) if len(idx): data[idx] = null_rep elif nulls == "all": @@ -2634,7 +2681,9 @@ def test_iteritems(gdf): def test_quantile(q, numeric_only): ts = pd.date_range("2018-08-24", periods=5, freq="D") td = pd.to_timedelta(np.arange(5), unit="h") - pdf = pd.DataFrame({"date": ts, "delta": td, "val": np.random.randn(len(ts))}) + pdf = pd.DataFrame( + {"date": ts, "delta": td, "val": np.random.randn(len(ts))} + ) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) @@ -2656,7 +2705,9 @@ def test_quantile(q, numeric_only): ) def test_decimal_quantile(q, interpolation, decimal_type): data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] - gdf = cudf.DataFrame({"id": np.random.randint(0, 10, size=len(data)), "val": data}) + gdf = cudf.DataFrame( + {"id": np.random.randint(0, 10, size=len(data)), "val": data} + ) gdf["id"] = gdf["id"].astype("float64") gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) pdf = gdf.to_pandas() @@ -2738,7 +2789,8 @@ def test_cuda_array_interface(dtype): @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): np_list_data = [ - np.random.randint(0, 100, nelem).astype(data_type) for i in range(nchunks) + np.random.randint(0, 100, nelem).astype(data_type) + for i in range(nchunks) ] pa_chunk_array = pa.chunked_array(np_list_data) @@ -2754,10 +2806,13 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): assert_eq(expect, got) np_list_data2 = [ - np.random.randint(0, 100, nelem).astype(data_type) for i in range(nchunks) + np.random.randint(0, 100, nelem).astype(data_type) + for i in range(nchunks) ] pa_chunk_array2 = pa.chunked_array(np_list_data2) - pa_table = pa.Table.from_arrays([pa_chunk_array, pa_chunk_array2], names=["a", "b"]) + pa_table = pa.Table.from_arrays( + [pa_chunk_array, pa_chunk_array2], names=["a", "b"] + ) expect = pa_table.to_pandas() if cudf.api.types.is_datetime64_dtype( @@ -2848,7 +2903,9 @@ def test_dataframe_boolmask(mask_shape): [True, False, True], pytest.param( cudf.Series([True, False, True]), - marks=pytest_xfail(reason="Pandas can't index a multiindex with a Series"), + marks=pytest_xfail( + reason="Pandas can't index a multiindex with a Series" + ), ), ], ) @@ -3307,7 +3364,9 @@ def test_dataframe_reindex(copy, reindex_data, args, gd_kwargs): ), ], ) -def test_dataframe_reindex_fill_value(reindex_data_numeric, args, kwargs, fill_value): +def test_dataframe_reindex_fill_value( + reindex_data_numeric, args, kwargs, fill_value +): pdf, gdf = reindex_data_numeric.to_pandas(), reindex_data_numeric kwargs["fill_value"] = fill_value assert_eq(pdf.reindex(*args, **kwargs), gdf.reindex(*args, **kwargs)) @@ -3336,7 +3395,9 @@ def test_series_categorical_reindex(copy): gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) pdf = gdf.to_pandas() assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) - 
assert_eq(pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy)) + assert_eq( + pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy) + ) assert_eq( pdf["a"].reindex(index=index, copy=True), gdf["a"].reindex(index=index, copy=copy), @@ -3349,7 +3410,9 @@ def test_series_float_reindex(copy): gdf = cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) pdf = gdf.to_pandas() assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) - assert_eq(pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy)) + assert_eq( + pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy) + ) assert_eq( pdf["c"].reindex(index=index, copy=True), gdf["c"].reindex(index=index, copy=copy), @@ -3362,7 +3425,9 @@ def test_series_string_reindex(copy): gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) pdf = gdf.to_pandas() assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) - assert_eq(pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy)) + assert_eq( + pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy) + ) assert_eq( pdf["d"].reindex(index=index, copy=True), gdf["d"].reindex(index=index, copy=copy), @@ -3389,7 +3454,9 @@ def test_reindex_multiindex_col_to_multiindex(names, klass): @pytest.mark.parametrize("names", [None, ["a", "b"]]) @pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) def test_reindex_tuple_col_to_multiindex(names, klass): - idx = pd.Index([("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False) + idx = pd.Index( + [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False + ) df = pd.DataFrame([[1, 2]], columns=idx) gdf = cudf.from_pandas(df) midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names) @@ -3658,7 +3725,9 @@ def test_select_dtype(): ), ) - gdf = cudf.DataFrame({"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]}) + gdf = cudf.DataFrame( + {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} + ) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes(include=["object", "int", "category"]), @@ -3683,7 +3752,9 @@ def test_select_dtype(): pdf.select_dtypes(include=["object"]), gdf.select_dtypes(include=["object"]), ) - assert_eq(pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"])) + assert_eq( + pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"]) + ) assert_eq( pdf.select_dtypes(exclude=["float"]), gdf.select_dtypes(exclude=["float"]), @@ -3715,7 +3786,9 @@ def test_select_dtype(): gdf.select_dtypes(include=["int"], exclude=["object"]), ) - gdf = cudf.DataFrame({"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]}) + gdf = cudf.DataFrame( + {"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]} + ) pdf = gdf.to_pandas() assert_eq( pdf.select_dtypes("int64"), @@ -3840,7 +3913,9 @@ def test_dataframe_describe_percentiles(): def test_get_numeric_data(): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]}) + pdf = pd.DataFrame( + {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} + ) gdf = cudf.from_pandas(pdf) assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) @@ -3968,7 +4043,9 @@ def test_ndim(): [1, 4, 3, -6], index=["floats", "ints", "floats_with_nan", "floats_same"], ), - cudf.Series([-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"]), + cudf.Series( + [-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"] + ), {"floats": -1, "ints": 15, "floats_will_nan": 2}, ], ) @@ -4207,12 +4284,16 @@ def 
test_as_column_types(): assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series(column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32")) + gds = cudf.Series( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") + ) assert_eq(pds, gds) pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series(column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str")) + gds = cudf.Series( + column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") + ) assert_eq(pds, gds) @@ -4246,7 +4327,10 @@ def test_no_cols_head(index): @pytest.mark.parametrize("dtype", ALL_TYPES) @pytest.mark.parametrize( "np_dtype,pd_dtype", - [tuple(item) for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items()], + [ + tuple(item) + for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items() + ], ) def test_series_astype_pandas_nullable(dtype, np_dtype, pd_dtype): source = cudf.Series([0, 1, None], dtype=dtype) @@ -4380,7 +4464,9 @@ def test_series_astype_to_categorical_ordered(ordered): psr = pd.Series([1, 2, 3, 1], dtype="category") gsr = cudf.from_pandas(psr) - ordered_dtype_pd = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) + ordered_dtype_pd = pd.CategoricalDtype( + categories=[1, 2, 3], ordered=ordered + ) ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), @@ -4391,7 +4477,9 @@ def test_series_astype_to_categorical_ordered(ordered): @pytest.mark.parametrize("ordered", [True, False]) def test_series_astype_cat_ordered_to_unordered(ordered): pd_dtype = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) - pd_to_dtype = pd.CategoricalDtype(categories=[1, 2, 3], ordered=not ordered) + pd_to_dtype = pd.CategoricalDtype( + categories=[1, 2, 3], ordered=not ordered + ) gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) @@ -4481,7 +4569,9 @@ def test_series_astype_null_cases(): assert_eq( pd.Series(data, dtype="datetime64[ns]").astype("category"), - cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype("category"), + cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( + "category" + ), ) @@ -4575,7 +4665,9 @@ def test_dataframe_columns_returns_rangeindex_single_col(): @pytest.mark.parametrize("idx_data", [[], [1, 2]]) @pytest.mark.parametrize("data", [None, [], {}]) def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data): - result = cudf.DataFrame(data, columns=cudf.Index(idx_data, dtype=dtype)).columns + result = cudf.DataFrame( + data, columns=cudf.Index(idx_data, dtype=dtype) + ).columns expected = pd.Index(idx_data, dtype=dtype) assert_eq(result, expected) @@ -4660,11 +4752,15 @@ def test_series_values_property(data): {"A": np.float32(np.arange(3)), "B": np.float64(np.arange(3))}, pytest.param( {"A": [1, None, 3], "B": [1, 2, None]}, - marks=pytest_xfail(reason="Nulls not supported by values accessor"), + marks=pytest_xfail( + reason="Nulls not supported by values accessor" + ), ), pytest.param( {"A": [None, None, None], "B": [None, None, None]}, - marks=pytest_xfail(reason="Nulls not supported by values accessor"), + marks=pytest_xfail( + reason="Nulls not supported by values accessor" + ), ), {"A": [], "B": []}, pytest.param( @@ -4795,7 +4891,8 @@ def test_isin_dataframe(data, values): except TypeError as e: # Can't do isin with different categories if str(e) == ( - "Categoricals can only be compared if 'categories' " "are the 
same." + "Categoricals can only be compared if 'categories' " + "are the same." ): return @@ -4989,7 +5086,9 @@ def test_df_astype_to_categorical_ordered(ordered): pdf["bar"] = psr gdf = cudf.DataFrame.from_pandas(pdf) - ordered_dtype_pd = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) + ordered_dtype_pd = pd.CategoricalDtype( + categories=[1, 2, 3], ordered=ordered + ) ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) assert_eq( @@ -5016,7 +5115,9 @@ def test_empty_df_astype(dtype): @pytest.mark.parametrize( "errors", [ - pytest.param("raise", marks=pytest_xfail(reason="should raise error here")), + pytest.param( + "raise", marks=pytest_xfail(reason="should raise error here") + ), pytest.param("other", marks=pytest_xfail(raises=ValueError)), "ignore", ], @@ -5069,7 +5170,9 @@ def test_df_constructor_dtype(dtype): { "a": [1, 2, 3, 4], "b": [7, np.NaN, 9, 10], - "c": cudf.Series([np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False), + "c": cudf.Series( + [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False + ), "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), @@ -5085,7 +5188,9 @@ def test_df_constructor_dtype(dtype): ), ], ) -@pytest.mark.parametrize("op", ["max", "min", "sum", "product", "mean", "var", "std"]) +@pytest.mark.parametrize( + "op", ["max", "min", "sum", "product", "mean", "var", "std"] +) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_rowwise_ops(data, op, skipna, numeric_only): @@ -5097,7 +5202,11 @@ def test_rowwise_ops(data, op, skipna, numeric_only): kwargs["ddof"] = 0 if not numeric_only and not all( - ((pdf[column].count() == 0) if skipna else (pdf[column].notna().count() == 0)) + ( + (pdf[column].count() == 0) + if skipna + else (pdf[column].notna().count() == 0) + ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) or cudf.api.types.is_bool_dtype(pdf[column].dtype) for column in pdf @@ -5118,7 +5227,9 @@ def test_rowwise_ops(data, op, skipna, numeric_only): ) -@pytest.mark.parametrize("op", ["max", "min", "sum", "product", "mean", "var", "std"]) +@pytest.mark.parametrize( + "op", ["max", "min", "sum", "product", "mean", "var", "std"] +) def test_rowwise_ops_nullable_dtypes_all_null(op): gdf = cudf.DataFrame( { @@ -5324,17 +5435,23 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes ): with pytest.raises(TypeError): - got = getattr(gdf, op)(axis=1, skipna=skipna, numeric_only=numeric_only) + got = getattr(gdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) with pytest.raises(TypeError): expected = getattr(pdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only ) else: - got = getattr(gdf, op)(axis=1, skipna=skipna, numeric_only=numeric_only) - expected = getattr(pdf, op)(axis=1, skipna=skipna, numeric_only=numeric_only) - if got.dtype == cudf.dtype("datetime64[us]") and expected.dtype == np.dtype( - "datetime64[ns]" - ): + got = getattr(gdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + expected = getattr(pdf, op)( + axis=1, skipna=skipna, numeric_only=numeric_only + ) + if got.dtype == cudf.dtype( + "datetime64[us]" + ) and expected.dtype == np.dtype("datetime64[ns]"): # Workaround for a PANDAS-BUG: # https://github.com/pandas-dev/pandas/issues/52524 assert_eq(got.astype("datetime64[ns]"), expected) @@ -5351,7 +5468,9 @@ def 
test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): ["2020-08-01 09:00:00", "1920-05-01 10:30:00"], dtype=""}], + "val": [ + {"name": "var1", "val": None, "type": "optional"} + ], "type": "list", }, {}, diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index ba0a47f793d..161b245953b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -126,7 +126,9 @@ def test_drop_duplicates(): expected = pdf.drop_duplicates("E", keep="last") assert_eq(result, expected) - pdf = pd.DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) + pdf = pd.DataFrame( + {"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]} + ) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) @@ -155,7 +157,9 @@ def test_drop_duplicates(): @pytest.mark.skip(reason="cudf does not support duplicate column names yet") def test_drop_duplicates_with_duplicate_column_names(): - df = pd.DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) + df = pd.DataFrame( + [[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"] + ) df = cudf.DataFrame.from_pandas(df) result0 = df.drop_duplicates() @@ -339,8 +343,12 @@ def test_dataframe_drop_duplicates_method(): assert_eq(gdf.drop_duplicates("n1"), pdf.drop_duplicates("n1")) assert_eq(gdf.drop_duplicates("n2"), pdf.drop_duplicates("n2")) assert_eq(gdf.drop_duplicates("s1"), pdf.drop_duplicates("s1")) - assert_eq(gdf.drop_duplicates(["n1", "n2"]), pdf.drop_duplicates(["n1", "n2"])) - assert_eq(gdf.drop_duplicates(["n1", "s1"]), pdf.drop_duplicates(["n1", "s1"])) + assert_eq( + gdf.drop_duplicates(["n1", "n2"]), pdf.drop_duplicates(["n1", "n2"]) + ) + assert_eq( + gdf.drop_duplicates(["n1", "s1"]), pdf.drop_duplicates(["n1", "s1"]) + ) # Test drop error assert_exceptions_equal( diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 9a4a4489750..12a325fa4e8 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
import os from string import ascii_letters @@ -19,7 +19,10 @@ def pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types} + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + } ) # Delete the name of the column index, and rename the row index test_pdf.columns.name = None diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a9b319d88ad..06516b6b4ea 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -111,7 +111,9 @@ def pdf(gdf): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_mean(nelem): got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() + expect_df = ( + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() + ) assert_groupby_results_equal(got_df, expect_df) @@ -119,18 +121,30 @@ def test_groupby_mean(nelem): def test_groupby_mean_3level(nelem): lvls = "z" bys = list("xyz") - got_df = make_frame(DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys).mean() + got_df = ( + make_frame(DataFrame, nelem=nelem, extra_levels=lvls) + .groupby(bys) + .mean() + ) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys).mean() + make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) + .groupby(bys) + .mean() ) assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_mean_min(nelem): - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(["mean", "min"]) + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"]) + .agg(["mean", "min"]) + ) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(["mean", "min"]) + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"]) + .agg(["mean", "min"]) ) assert_groupby_results_equal(got_df, expect_df) @@ -185,11 +199,17 @@ def test_groupby_as_index_apply(pdf, gdf, as_index, engine): @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_multiindex(pdf, gdf, as_index): - pdf = pd.DataFrame({"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]}) + pdf = pd.DataFrame( + {"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]} + ) gdf = cudf.from_pandas(pdf) - gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg({"c": "mean"}) - pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg({"c": "mean"}) + gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) + pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) if as_index: assert_eq(pdf, gdf) @@ -362,7 +382,9 @@ def foo(key1, val1, com1, com2): got = got.to_pandas() expect = df.copy() - expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype(np.float64) + expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype( + np.float64 + ) expect["com2"] = np.zeros(nelem, dtype=np.int32) assert_groupby_results_equal(expect, got) @@ -400,7 +422,9 @@ def groupby_jit_data_large(groupby_jit_data_small): manifesting numerical issues such as overflow. 
""" max_tpb = 1024 - factor = max_tpb + 1 # bigger than a block but not always an exact multiple + factor = ( + max_tpb + 1 + ) # bigger than a block but not always an exact multiple df = cudf.concat([groupby_jit_data_small] * factor) return df @@ -476,14 +500,18 @@ def func(df): "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -def test_groupby_apply_jit_unary_reductions(func, dtype, dataset, groupby_jit_datasets): +def test_groupby_apply_jit_unary_reductions( + func, dtype, dataset, groupby_jit_datasets +): dataset = groupby_jit_datasets[dataset] groupby_apply_jit_reductions_test_inner(func, dataset, dtype) # test unary reductions for special values -def groupby_apply_jit_reductions_special_vals_inner(func, data, dtype, special_val): +def groupby_apply_jit_reductions_special_vals_inner( + func, data, dtype, special_val +): funcstr = textwrap.dedent( f""" def func(df): @@ -503,7 +531,9 @@ def func(df): # test unary index reductions for special values -def groupby_apply_jit_idx_reductions_special_vals_inner(func, data, dtype, special_val): +def groupby_apply_jit_idx_reductions_special_vals_inner( + func, data, dtype, special_val +): funcstr = textwrap.dedent( f""" def func(df): @@ -587,7 +617,9 @@ def func(group): pytest.param( "small", marks=[ - pytest.mark.filterwarnings("ignore:Degrees of Freedom <= 0 for slice"), + pytest.mark.filterwarnings( + "ignore:Degrees of Freedom <= 0 for slice" + ), pytest.mark.filterwarnings( "ignore:divide by zero encountered in divide" ), @@ -610,7 +642,10 @@ def func(group): if np.dtype(dtype).kind == "f": # Correlation of floating types is not yet supported: # https://github.com/rapidsai/cudf/issues/13839 - m = f"Series.corr\\(Series\\) is not " f"supported for \\({dtype}, {dtype}\\)" + m = ( + f"Series.corr\\(Series\\) is not " + f"supported for \\({dtype}, {dtype}\\)" + ) with pytest.raises(UDFError, match=m): run_groupby_apply_jit_test(dataset, func, keys) return @@ -623,7 +658,9 @@ def test_groupby_apply_jit_correlation_zero_variance(dtype): # pearson correlation is undefined when the variance of either # variable is zero. This test ensures that the jit implementation # returns the same result as pandas in this case. 
- data = DataFrame({"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]}) + data = DataFrame( + {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} + ) def func(group): return group["b"].corr(group["c"]) @@ -647,7 +684,9 @@ def func(group): @pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data_small, op): +def test_groupby_apply_jit_invalid_binary_ops_error( + groupby_jit_data_small, op +): keys = ["key1"] def func(group): @@ -718,9 +757,13 @@ def f3(df, k, L, m): return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] -@pytest.mark.parametrize("func,args", create_test_groupby_apply_jit_args_params()) +@pytest.mark.parametrize( + "func,args", create_test_groupby_apply_jit_args_params() +) def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): - run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"], *args) + run_groupby_apply_jit_test( + groupby_jit_data_small, func, ["key1", "key2"], *args + ) def test_groupby_apply_jit_block_divergence(): @@ -877,7 +920,9 @@ def pdf_func(df): ) def test_groupby_2keys_agg(nelem, func): # gdf (Note: lack of multiIndex) - expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + expect_df = ( + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + ) got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) check_dtype = func not in _index_type_aggs @@ -1246,8 +1291,12 @@ def test_groupby_list_then_string(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [6, 7, 6, 7, 6] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"], "c": "max"}) - pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"], "c": "max"}) + gdg = gdf.groupby("a", as_index=True).agg( + {"b": ["min", "max"], "c": "max"} + ) + pdg = pdf.groupby("a", as_index=True).agg( + {"b": ["min", "max"], "c": "max"} + ) assert_groupby_results_equal(gdg, pdg) @@ -1257,8 +1306,12 @@ def test_groupby_different_unequal_length_column_aggregations(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg({"b": "min", "c": ["max", "min"]}) - pdg = pdf.groupby("a", as_index=True).agg({"b": "min", "c": ["max", "min"]}) + gdg = gdf.groupby("a", as_index=True).agg( + {"b": "min", "c": ["max", "min"]} + ) + pdg = pdf.groupby("a", as_index=True).agg( + {"b": "min", "c": ["max", "min"]} + ) assert_groupby_results_equal(pdg, gdg) @@ -1430,7 +1483,9 @@ def test_groupby_nulls_in_index(): pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) def test_groupby_all_nulls_index(): @@ -1441,13 +1496,17 @@ def test_groupby_all_nulls_index(): } ) pdf = gdf.to_pandas() - assert_groupby_results_equal(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) gdf = cudf.DataFrame( {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} ) pdf = gdf.to_pandas() - assert_groupby_results_equal(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) @pytest.mark.parametrize("sort", [True, False]) @@ -1461,7 +1520,9 @@ def test_groupby_sort(sort): check_like=not sort, ) - pdf = 
pd.DataFrame({"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]}) + pdf = pd.DataFrame( + {"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]} + ) gdf = cudf.from_pandas(pdf) assert_eq( @@ -1520,7 +1581,9 @@ def test_groupby_quantile(request, interpolation, q): request.applymarker( pytest.mark.xfail( condition=(q == 0.5 and interpolation == "nearest"), - reason=("Pandas NaN Rounding will fail nearest interpolation at 0.5"), + reason=( + "Pandas NaN Rounding will fail nearest interpolation at 0.5" + ), ) ) @@ -1621,7 +1684,9 @@ def test_groupby_cumcount(index): @pytest.mark.parametrize("nelem", [2, 3, 1000]) @pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("agg", ["min", "max", "idxmin", "idxmax", "mean", "count"]) +@pytest.mark.parametrize( + "agg", ["min", "max", "idxmin", "idxmax", "mean", "count"] +) def test_groupby_datetime(nelem, as_index, agg): if agg == "mean" and as_index is True: return @@ -1647,7 +1712,9 @@ def test_groupby_datetime(nelem, as_index, agg): def test_groupby_dropna(): df = cudf.DataFrame({"a": [1, 1, None], "b": [1, 2, 3]}) - expect = cudf.DataFrame({"b": [3, 3]}, index=cudf.Series([1, None], name="a")) + expect = cudf.DataFrame( + {"b": [3, 3]}, index=cudf.Series([1, None], name="a") + ) got = df.groupby("a", dropna=False).sum() assert_groupby_results_equal(expect, got) @@ -1715,7 +1782,9 @@ def test_groupby_series_same_name_as_dataframe_column(): def test_group_by_series_and_column_name_in_by(): - gdf = cudf.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3]) + gdf = cudf.DataFrame( + {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3] + ) gsr0 = cudf.Series([0.0, 1.0, 2.0], name="a", index=[1, 2, 3]) gsr1 = cudf.Series([0.0, 1.0, 3.0], name="b", index=[3, 4, 5]) @@ -1751,7 +1820,9 @@ def test_grouping(grouper): ) gdf = cudf.from_pandas(pdf) - for pdf_group, gdf_group in zip(pdf.groupby(grouper), gdf.groupby(grouper)): + for pdf_group, gdf_group in zip( + pdf.groupby(grouper), gdf.groupby(grouper) + ): assert pdf_group[0] == gdf_group[0] assert_eq(pdf_group[1], gdf_group[1]) @@ -1873,7 +1944,9 @@ def test_groupby_agg_combinations(agg): def test_groupby_apply_noempty_group(): - pdf = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]}) + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} + ) gdf = cudf.from_pandas(pdf) expect = ( @@ -1930,7 +2003,9 @@ def _groupby(self): ], ) def test_groupby_groups(by): - pdf = pd.DataFrame({"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]}) + pdf = pd.DataFrame( + {"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]} + ) gdf = cudf.from_pandas(pdf) pdg = pdf.groupby(by) @@ -2049,7 +2124,9 @@ def test_groupby_list_single_element(list_agg): ) -@pytest.mark.parametrize("agg", [list, [list, "count"], {"b": list, "c": "sum"}]) +@pytest.mark.parametrize( + "agg", [list, [list, "count"], {"b": list, "c": "sum"}] +) def test_groupby_list_strings(agg): pdf = pd.DataFrame( { @@ -2129,7 +2206,9 @@ def f3(x, k, L, m): return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] -@pytest.mark.parametrize("func,args", create_test_groupby_apply_return_scalars_params()) +@pytest.mark.parametrize( + "func,args", create_test_groupby_apply_return_scalars_params() +) def test_groupby_apply_return_scalars(func, args): pdf = pd.DataFrame( { @@ -2189,7 +2268,9 @@ def f5(x, k, L, m): "func,args", create_test_groupby_apply_return_series_dataframe_params() ) def test_groupby_apply_return_series_dataframe(func, args): - pdf = 
pd.DataFrame({"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]}) + pdf = pd.DataFrame( + {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} + ) gdf = cudf.from_pandas(pdf) expected = pdf.groupby(["key"], group_keys=False).apply( @@ -2273,11 +2354,17 @@ def test_groupby_unique(by, data, dtype): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize("func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"]) +@pytest.mark.parametrize( + "func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"] +) def test_groupby_2keys_scan(nelem, func): pdf = make_frame(pd.DataFrame, nelem=nelem) expect_df = pdf.groupby(["x", "y"], sort=True).agg(func) - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"], sort=True).agg(func) + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) + ) # pd.groupby.cumcount returns a series. if isinstance(expect_df, pd.Series): expect_df = expect_df.to_frame("val") @@ -2315,7 +2402,9 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): def test_groupby_rank_fails(): - gdf = cudf.DataFrame({"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]}) + gdf = cudf.DataFrame( + {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} + ) with pytest.raises(NotImplementedError): gdf.groupby(["x", "y"]).rank(method="min", axis=1) gdf = cudf.DataFrame( @@ -2328,7 +2417,9 @@ def test_groupby_rank_fails(): gdf.groupby(["a"]).rank(method="min", axis=1) -@pytest.mark.parametrize("with_nan", [False, True], ids=["just-NA", "also-NaN"]) +@pytest.mark.parametrize( + "with_nan", [False, True], ids=["just-NA", "also-NaN"] +) @pytest.mark.parametrize("dropna", [False, True], ids=["keepna", "dropna"]) @pytest.mark.parametrize( "duplicate_index", [False, True], ids=["rangeindex", "dupindex"] @@ -2376,10 +2467,14 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): gdf = cudf.from_pandas(pdf) n_shift = int(nelem * shift_perc) * direction - expected = pdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) + expected = pdf.groupby(["x", "y"]).shift( + periods=n_shift, fill_value=fill_value + ) got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) - assert_groupby_results_equal(expected[["val", "val2"]], got[["val", "val2"]]) + assert_groupby_results_equal( + expected[["val", "val2"]], got[["val", "val2"]] + ) @pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @@ -2403,7 +2498,9 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): ), ], ) -def test_groupby_shift_row_mixed_numerics(nelem, shift_perc, direction, fill_value): +def test_groupby_shift_row_mixed_numerics( + nelem, shift_perc, direction, fill_value +): t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2486,7 +2583,9 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ] ], ) -def test_groupby_shift_row_mixed_fill(nelem, shift_perc, direction, fill_value): +def test_groupby_shift_row_mixed_fill( + nelem, shift_perc, direction, fill_value +): t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2517,7 +2616,9 @@ def test_groupby_shift_row_mixed_fill(nelem, shift_perc, direction, fill_value): if isinstance(single_fill, cudf.Scalar): single_fill = single_fill._host_value expected[col] = ( - pdf[col].groupby(pdf["0"]).shift(periods=n_shift, fill_value=single_fill) + pdf[col] + .groupby(pdf["0"]) + .shift(periods=n_shift, 
fill_value=single_fill) ) got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) @@ -2570,7 +2671,9 @@ def test_groupby_diff_row(nelem, shift_perc, direction): expected = pdf.groupby(["x", "y"]).diff(periods=n_shift) got = gdf.groupby(["x", "y"]).diff(periods=n_shift) - assert_groupby_results_equal(expected[["val", "val2"]], got[["val", "val2"]]) + assert_groupby_results_equal( + expected[["val", "val2"]], got[["val", "val2"]] + ) @pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @@ -2672,7 +2775,8 @@ def test_groupby_fillna_multi_value(nelem): # fill the dataframe with the first non-null item in the column fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] for name in value_cols + name: pdf[name].loc[pdf[name].first_valid_index()] + for name in value_cols } # cudf can't fillna with a pandas.Timedelta type fill_values["4"] = fill_values["4"].to_numpy() @@ -2716,7 +2820,8 @@ def test_groupby_fillna_multi_value_df(nelem): # fill the dataframe with the first non-null item in the column fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] for name in value_cols + name: pdf[name].loc[pdf[name].first_valid_index()] + for name in value_cols } # cudf can't fillna with a pandas.Timedelta type fill_values["4"] = fill_values["4"].to_numpy() @@ -2735,7 +2840,9 @@ def test_groupby_fillna_multi_value_df(nelem): "by", [pd.Series([1, 1, 2, 2, 3, 4]), lambda x: x % 2 == 0, pd.Grouper(level=0)], ) -@pytest.mark.parametrize("data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]]) +@pytest.mark.parametrize( + "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] +) @pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) @@ -2799,7 +2906,9 @@ def test_groupby_fillna_method(nelem, method): with pytest.warns(FutureWarning): got = gdf.groupby(key_col).fillna(method=method) - assert_groupby_results_equal(expect[value_cols], got[value_cols], sort=False) + assert_groupby_results_equal( + expect[value_cols], got[value_cols], sort=False + ) @pytest.mark.parametrize( @@ -3129,7 +3238,9 @@ def test_groupby_select_then_diff(): @pytest.mark.parametrize("by", ["a", ["a", "b"], pd.Series([1, 2, 1, 3])]) def test_groupby_transform_maintain_index(by): # test that we maintain the index after a groupby transform - gdf = cudf.DataFrame({"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0]) + gdf = cudf.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0] + ) pdf = gdf.to_pandas() assert_groupby_results_equal( pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max") @@ -3176,7 +3287,9 @@ def test_groupby_pct_change(data, gkey, periods, fill_method): pdf = gdf.to_pandas() with expect_warning_if(fill_method not in (no_default, None)): - actual = gdf.groupby(gkey).pct_change(periods=periods, fill_method=fill_method) + actual = gdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) with expect_warning_if( ( fill_method not in (no_default, None) @@ -3274,7 +3387,9 @@ def test_groupby_ngroup(by, ascending, df_ngroup): PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="warning not present in older pandas versions", ) -@pytest.mark.parametrize("groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]) +@pytest.mark.parametrize( + "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) def test_groupby_dtypes(groups): df = cudf.DataFrame( {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]} @@ -3300,7 +3415,9 @@ 
def test_groupby_by_index_names(index_names): ) -@pytest.mark.parametrize("groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]) +@pytest.mark.parametrize( + "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) def test_group_by_pandas_compat(groups): with cudf.option_context("mode.pandas_compatible", True): df = cudf.DataFrame( @@ -3322,7 +3439,9 @@ def index(self, request): if request.param == "rangeindex": return cudf.RangeIndex(2, n + 2) elif request.param == "intindex": - return cudf.Index([2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32") + return cudf.Index( + [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" + ) elif request.param == "strindex": return cudf.Index(list(string.ascii_lowercase[:n])) elif request.param == "default": @@ -3392,7 +3511,9 @@ def test_not_implemented_arguments(self, df): @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) @pytest.mark.parametrize("replace", [False, True]) def test_fraction_rounding(self, df, by, frac, replace): - result = df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") + result = ( + df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") + ) assert_eq(self.expected(df, frac=frac), result.reset_index(drop=True)) @@ -3401,7 +3522,9 @@ class TestHeadTail: def n(self, request): return request.param - @pytest.fixture(params=[False, True], ids=["no-preserve-order", "preserve-order"]) + @pytest.fixture( + params=[False, True], ids=["no-preserve-order", "preserve-order"] + ) def preserve_order(self, request): return request.param @@ -3442,7 +3565,9 @@ def expected(self, df, n, take_head, preserve_order): slicefunc = operator.itemgetter(slice(None, n)) else: # Tail does group[-n:] except when n == 0 - slicefunc = operator.itemgetter(slice(-n, None) if n else slice(0)) + slicefunc = operator.itemgetter( + slice(-n, None) if n else slice(0) + ) values_to_sort = np.hstack( [df.values_host, np.arange(len(df)).reshape(-1, 1)] ) @@ -3455,7 +3580,9 @@ def expected(self, df, n, take_head, preserve_order): ) ) ) - return cudf.DataFrame({"a": expect_a, "b": expect_b}, index=index) + return cudf.DataFrame( + {"a": expect_a, "b": expect_b}, index=index + ) def test_head_tail(self, df, n, take_head, expected, preserve_order): if take_head: @@ -3482,7 +3609,9 @@ def test_head_tail_empty(): assert_eq(expected, got, check_column_type=False) -@pytest.mark.parametrize("groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]) +@pytest.mark.parametrize( + "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) @pytest.mark.parametrize("sort", [True, False]) def test_group_by_pandas_sort_order(groups, sort): with cudf.option_context("mode.pandas_compatible", True): @@ -3523,7 +3652,9 @@ def test_group_by_empty_reduction(dtype, reduce_op): gg = gdf.groupby("a")["c"] pg = pdf.groupby("a")["c"] - assert_eq(getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True) + assert_eq( + getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py index 9d7b4599a89..e081119ff89 100644 --- a/python/cudf/cudf/tests/test_hash_vocab.py +++ b/python/cudf/cudf/tests/test_hash_vocab.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import filecmp import os import warnings @@ -10,7 +10,9 @@ @pytest.fixture(scope="module") def datadir(datadir): - return os.path.join(datadir, "subword_tokenizer_data", "bert_base_cased_sampled") + return os.path.join( + datadir, "subword_tokenizer_data", "bert_base_cased_sampled" + ) def test_correct_bert_base_vocab_hash(datadir, tmpdir): diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index d48bf4c5b98..d420c95cfb4 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -15,7 +15,9 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): - types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set(UNSIGNED_TYPES) + types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set( + UNSIGNED_TYPES + ) typer = {"col_" + val: val for val in types} ncols = len(types) nrows = request.param @@ -57,7 +59,9 @@ def hdf_files(request, tmp_path_factory, pdf): fname_series = {} for column in pdf.columns: - fname_series[column] = tmp_path_factory.mktemp("hdf") / "test_series.hdf" + fname_series[column] = ( + tmp_path_factory.mktemp("hdf") / "test_series.hdf" + ) pdf[column].to_hdf( fname_series[column], key="hdf_series_tests", format=request.param ) @@ -79,7 +83,9 @@ def test_hdf_reader(hdf_files, columns): expect_df = pd.read_hdf(hdf_df_file, columns=columns) got_df = cudf.read_hdf(hdf_df_file, columns=columns) - assert_eq(expect_df, got_df, check_categorical=False, check_index_type=False) + assert_eq( + expect_df, got_df, check_categorical=False, check_index_type=False + ) for column in hdf_series.keys(): expect_series = pd.read_hdf(hdf_series[column]) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index 61ece42e9fc..f8de16f8609 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import os from io import BytesIO @@ -57,7 +57,9 @@ def test_read_csv(tmpdir, pdf, hdfs, test_url): hdfs.upload(basedir + "/test_csv_reader.csv", buffer) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_csv_reader.csv".format(host, port, basedir) + hd_fpath = "hdfs://{}:{}{}/test_csv_reader.csv".format( + host, port, basedir + ) else: hd_fpath = f"hdfs://{basedir}/test_csv_reader.csv" @@ -74,7 +76,9 @@ def test_read_csv(tmpdir, pdf, hdfs, test_url): def test_write_csv(pdf, hdfs, test_url): gdf = cudf.from_pandas(pdf) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_csv_writer.csv".format(host, port, basedir) + hd_fpath = "hdfs://{}:{}{}/test_csv_writer.csv".format( + host, port, basedir + ) else: hd_fpath = f"hdfs://{basedir}/test_csv_writer.csv" @@ -132,7 +136,9 @@ def test_write_parquet(pdf, hdfs, test_url): assert_eq(pdf, got) -@pytest.mark.xfail(reason="Writing string columns with parition_cols is incorrect") +@pytest.mark.xfail( + reason="Writing string columns with parition_cols is incorrect" +) @pytest.mark.parametrize("test_url", [False, True]) def test_write_parquet_partitioned(tmpdir, pdf, hdfs, test_url): pdf.to_parquet( @@ -149,11 +155,15 @@ def test_write_parquet_partitioned(tmpdir, pdf, hdfs, test_url): hd_fpath = f"hdfs://{basedir}/test_parquet_partitioned.parquet" # Clear data written from previous runs hdfs.rm(f"{basedir}/test_parquet_partitioned.parquet", recursive=True) - gdf.to_parquet(hd_fpath, index=False, partition_cols=["Integer", "Boolean"]) + gdf.to_parquet( + hd_fpath, index=False, partition_cols=["Integer", "Boolean"] + ) assert hdfs.exists(f"{basedir}/test_parquet_partitioned.parquet") got = pd.read_parquet(hd_fpath) - expect = pd.read_parquet(tmpdir.join("pandas_parquet_writer_partitioned.parquet")) + expect = pd.read_parquet( + tmpdir.join("pandas_parquet_writer_partitioned.parquet") + ) assert_eq(expect, got) @@ -171,7 +181,9 @@ def test_read_json(tmpdir, pdf, hdfs, test_url): hdfs.upload(basedir + "/test_json_reader.json", buffer) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_json_reader.json".format(host, port, basedir) + hd_fpath = "hdfs://{}:{}{}/test_json_reader.json".format( + host, port, basedir + ) else: hd_fpath = f"hdfs://{basedir}/test_json_reader.json" @@ -209,7 +221,9 @@ def test_write_orc(pdf, hdfs, test_url): pdf["Integer2"] = pdf["Integer2"].astype("int64") gdf = cudf.from_pandas(pdf) if test_url: - hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format(host, port, basedir) + hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format( + host, port, basedir + ) else: hd_fpath = f"hdfs://{basedir}/test_orc_writer.orc" diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 8c139a75161..05213d7601c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -193,7 +193,9 @@ def test_pandas_as_index(): pdf_int_index = pd.Index([1, 2, 3, 4, 5]) pdf_uint_index = pd.Index([1, 2, 3, 4, 5]) pdf_float_index = pd.Index([1.0, 2.0, 3.0, 4.0, 5.0]) - pdf_datetime_index = pd.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pdf_datetime_index = pd.DatetimeIndex( + [1000000, 2000000, 3000000, 4000000, 5000000] + ) pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"]) # Define cudf Indexes @@ -219,7 +221,9 @@ def test_pandas_as_index(): assert_eq( pdf_category_index.codes, - gdf_category_index.codes.astype(pdf_category_index.codes.dtype).to_numpy(), + gdf_category_index.codes.astype( + pdf_category_index.codes.dtype + ).to_numpy(), ) @@ -704,7 +708,9 @@ def 
test_index_argsort(data): pd.Index([1, 10, 2, 100, -10], name="abc"), pd.Index(["z", "x", "a", "c", "b"]), pd.Index(["z", "x", "a", "c", "b"], dtype="category"), - pd.Index([-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index"), + pd.Index( + [-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index" + ), pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), pd.RangeIndex(0, 10, 1), @@ -718,8 +724,12 @@ def test_index_sort_values(data, ascending, return_indexer): pdi = data gdi = cudf.from_pandas(pdi) - expected = pdi.sort_values(ascending=ascending, return_indexer=return_indexer) - actual = gdi.sort_values(ascending=ascending, return_indexer=return_indexer) + expected = pdi.sort_values( + ascending=ascending, return_indexer=return_indexer + ) + actual = gdi.sort_values( + ascending=ascending, return_indexer=return_indexer + ) if return_indexer: expected_indexer = expected[1] @@ -1079,7 +1089,11 @@ def test_index_append_error(data, other): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) - got_dtype = gd_other.dtype if gd_data.dtype == np.dtype("object") else gd_data.dtype + got_dtype = ( + gd_other.dtype + if gd_data.dtype == np.dtype("object") + else gd_data.dtype + ) with pytest.raises( TypeError, match=re.escape( @@ -1229,7 +1243,9 @@ def test_index_append_list(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) @pytest.mark.parametrize("name", [1, "a", None]) def test_index_basic(data, dtype, name): pdi = pd.Index(data, dtype=dtype, name=name) @@ -1352,7 +1368,9 @@ def test_multiindex_append(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) def test_index_empty(data, dtype): pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) @@ -1361,7 +1379,9 @@ def test_index_empty(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) def test_index_size(data, dtype): pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) @@ -1370,7 +1390,9 @@ def test_index_size(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) def test_index_drop_duplicates(data, dtype): pdi = pd.Index(data, dtype=dtype) gdi = cudf.Index(data, dtype=dtype) @@ -1384,7 +1406,9 @@ def test_dropna_bad_how(): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) def test_index_tolist(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1400,7 +1424,9 @@ def test_index_tolist(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) -@pytest.mark.parametrize("dtype", 
NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) def test_index_iter_error(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1416,7 +1442,9 @@ def test_index_iter_error(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +) def test_index_values_host(data, dtype): gdi = cudf.Index(data, dtype=dtype) pdi = pd.Index(data, dtype=dtype) @@ -1536,7 +1564,9 @@ def test_multiindex_from_arrow(): def test_index_equals_categories(): - lhs = cudf.CategoricalIndex(["a", "b", "c", "b", "a"], categories=["a", "b", "c"]) + lhs = cudf.CategoricalIndex( + ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] + ) rhs = cudf.CategoricalIndex( ["a", "b", "c", "b", "a"], categories=["a", "b", "c", "_"] ) @@ -1899,7 +1929,8 @@ def test_get_indexer_single_duplicate_string(idx, key, method): if ( # `method` only applicable to monotonic index - (not pi.is_monotonic_increasing and method is not None) or not pi.is_unique + (not pi.is_monotonic_increasing and method is not None) + or not pi.is_unique ): assert_exceptions_equal( lfunc=pi.get_indexer, @@ -1995,11 +2026,15 @@ def test_get_loc_multi_numeric_deviate(idx, key, result): pi = idx gi = cudf.from_pandas(pi) - with expect_warning_if(isinstance(key, tuple), pd.errors.PerformanceWarning): + with expect_warning_if( + isinstance(key, tuple), pd.errors.PerformanceWarning + ): key_flag = key not in pi if key_flag: - with expect_warning_if(isinstance(key, tuple), pd.errors.PerformanceWarning): + with expect_warning_if( + isinstance(key, tuple), pd.errors.PerformanceWarning + ): assert_exceptions_equal( lfunc=pi.get_loc, rfunc=gi.get_loc, @@ -2166,7 +2201,9 @@ def test_get_loc_multi_string(idx, key): ), ], ) -@pytest.mark.parametrize("key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]]) +@pytest.mark.parametrize( + "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] +) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_multi_string(idx, key, method): pi = idx.sort_values() @@ -2209,7 +2246,9 @@ def test_get_indexer_multi_string(idx, key, method): def test_get_indexer_invalid(idx1, idx2): idx1 = idx1() idx2 = idx2() - assert_eq(idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas())) + assert_eq( + idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) + ) @pytest.mark.parametrize( @@ -2387,7 +2426,9 @@ def test_index_type_methods(data, func): assert_eq(expected, actual) -@pytest.mark.parametrize("resolution", ["D", "h", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] +) def test_index_datetime_ceil(resolution): cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pidx = cuidx.to_pandas() @@ -2398,7 +2439,9 @@ def test_index_datetime_ceil(resolution): assert_eq(pidx_ceil, cuidx_ceil) -@pytest.mark.parametrize("resolution", ["D", "h", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] +) def test_index_datetime_floor(resolution): cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pidx = cuidx.to_pandas() @@ -2409,7 +2452,9 @@ def test_index_datetime_floor(resolution): assert_eq(pidx_floor, cuidx_floor) 
-@pytest.mark.parametrize("resolution", ["D", "h", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] +) def test_index_datetime_round(resolution): cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) pidx = cuidx.to_pandas() @@ -2595,7 +2640,9 @@ def test_index_constructor_integer(default_integer_bitwidth): def test_index_constructor_float(default_float_bitwidth): got = cudf.Index([1.0, 2.0, 3.0]) - expect = cudf.Index([1.0, 2.0, 3.0], dtype=f"float{default_float_bitwidth}") + expect = cudf.Index( + [1.0, 2.0, 3.0], dtype=f"float{default_float_bitwidth}" + ) assert_eq(expect, got) @@ -2631,7 +2678,9 @@ def test_rangeindex_take_default_user_option(default_integer_bitwidth): # configuration for take operation. idx = cudf.RangeIndex(0, 100) actual = idx.take([0, 3, 7, 62]) - expected = cudf.Index([0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}") + expected = cudf.Index( + [0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}" + ) assert_eq(expected, actual) @@ -2674,7 +2723,9 @@ def test_rangeindex_binops_user_option( # configuration for binary operation. idx = cudf.RangeIndex(1, 5) actual = op(idx) - expected = cudf.Index(expected, dtype=f"{expected_kind}{default_integer_bitwidth}") + expected = cudf.Index( + expected, dtype=f"{expected_kind}{default_integer_bitwidth}" + ) assert_eq( expected, actual, @@ -2986,7 +3037,9 @@ def test_empty_index_init(): assert_eq(pidx, gidx) -@pytest.mark.parametrize("data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]) +@pytest.mark.parametrize( + "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] +) @pytest.mark.parametrize("data_name", [None, 1, "abc"]) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 4d7eecc767c..5f5c4579e01 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -188,7 +188,9 @@ def test_series_indexing_large_size(): @pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])]) -@pytest.mark.parametrize("arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]]) +@pytest.mark.parametrize( + "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] +) def test_series_get_item(psr, arg): gsr = cudf.from_pandas(psr) @@ -203,8 +205,12 @@ def test_dataframe_column_name_indexing(): data = np.asarray(range(10), dtype=np.int32) df["a"] = data df[1] = data - np.testing.assert_equal(df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32)) - np.testing.assert_equal(df[1].to_numpy(), np.asarray(range(10), dtype=np.int32)) + np.testing.assert_equal( + df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32) + ) + np.testing.assert_equal( + df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) + ) pdf = pd.DataFrame() nelem = 10 @@ -235,9 +241,13 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): df = cudf.DataFrame() size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype(np.int32) + df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( + np.int32 + ) df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype(np.int64) + df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( + np.int64 + ) df["d"] = hd = np.random.random(size).astype(np.float64) # Row slice first 10 @@ 
-316,13 +326,17 @@ def test_dataframe_loc(scalar, step): assert_eq(df.loc[begin:end, ["c", "d"]], pdf.loc[begin:end, ["c", "d"]]) # Slicing on columns: - assert_eq(df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"]) + assert_eq( + df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"] + ) # Slicing of size 1: assert_eq(df.loc[begin:begin, "a"], pdf.loc[begin:begin, "a"]) # TODO: Pandas changes the dtype here when it shouldn't - assert_eq(df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False) + assert_eq( + df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False + ) # Repeat with at[] assert_eq( @@ -369,7 +383,9 @@ def test_dataframe_loc_duplicate_index_scalar(): ) @pytest.mark.parametrize("arg", ["a", slice("a", "a"), slice("a", "b")]) def test_dataframe_loc_mask(mask, arg): - pdf = pd.DataFrame({"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]}) + pdf = pd.DataFrame( + {"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]} + ) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(pdf.loc[mask, arg], gdf.loc[mask, arg]) @@ -378,7 +394,9 @@ def test_dataframe_loc_mask(mask, arg): def test_dataframe_loc_outbound(): df = cudf.DataFrame() size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype(np.int32) + df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( + np.int32 + ) df["b"] = hb = np.random.random(size).astype(np.float32) pdf = pd.DataFrame() @@ -423,7 +441,9 @@ def test_series_loc_float_index(): def test_series_loc_string(): - ps = pd.Series([1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"]) + ps = pd.Series( + [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] + ) gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["one"], gs.loc["one"]) @@ -440,7 +460,9 @@ def test_series_loc_string(): def test_series_loc_datetime(): - ps = pd.Series([1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105")) + ps = pd.Series( + [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") + ) gs = cudf.Series.from_pandas(ps) # a few different ways of specifying a datetime label: @@ -501,7 +523,9 @@ def test_series_loc_datetime(): def test_series_loc_categorical(): - ps = pd.Series([1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"])) + ps = pd.Series( + [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) + ) gs = cudf.Series.from_pandas(ps) assert_eq(ps.loc["a"], gs.loc["a"]) @@ -513,7 +537,9 @@ def test_series_loc_categorical(): # order of categories changes, so we can only # compare values: - assert_eq(ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_numpy()) + assert_eq( + ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_numpy() + ) assert_eq( ps.loc[[True, False, True, False, True]], @@ -527,19 +553,25 @@ def test_series_loc_categorical(): pd.DataFrame( {"a": [1, 2, 3, 4]}, index=pd.MultiIndex.from_frame( - pd.DataFrame({"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]}) + pd.DataFrame( + {"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]} + ) ), ), pd.Series( [1, 2, 3, 4], index=pd.MultiIndex.from_frame( - pd.DataFrame({"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]}) + pd.DataFrame( + {"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]} + ) ), ), ], ) def test_dataframe_series_loc_multiindex(obj): - pindex = pd.MultiIndex.from_frame(pd.DataFrame({"A": [3, 2], "B": ["high", "low"]})) + pindex = pd.MultiIndex.from_frame( + pd.DataFrame({"A": [3, 2], "B": ["high", "low"]}) + ) gobj = 
cudf.from_pandas(obj) gindex = cudf.MultiIndex.from_pandas(pindex) @@ -595,7 +627,9 @@ def test_series_iloc(nelem): def test_dataframe_iloc(nelem): gdf = cudf.DataFrame() - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32) + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( + np.int32 + ) gdf["b"] = hb = np.random.random(nelem).astype(np.float32) pdf = pd.DataFrame() @@ -647,7 +681,9 @@ def test_dataframe_iloc(nelem): def test_dataframe_iloc_tuple(): gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32) + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( + np.int32 + ) gdf["b"] = hb = np.random.random(nelem).astype(np.float32) pdf = pd.DataFrame() @@ -661,7 +697,9 @@ def test_dataframe_iloc_tuple(): def test_dataframe_iloc_index_error(): gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32) + gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( + np.int32 + ) gdf["b"] = hb = np.random.random(nelem).astype(np.float32) pdf = pd.DataFrame() @@ -1180,7 +1218,9 @@ def test_dataframe_setitem_iloc_multiindex(key, value, pdf_gdf_multi): def test_boolean_indexing_single_row(pdf_gdf): pdf, gdf = pdf_gdf - assert_eq(pdf.loc[[True, False, False], :], gdf.loc[[True, False, False], :]) + assert_eq( + pdf.loc[[True, False, False], :], gdf.loc[[True, False, False], :] + ) def test_iloc_negative_indices(): @@ -1648,7 +1688,9 @@ def test_dataframe_loc_inplace_update_shape_mismatch_RHS_df(): def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df(): gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) with pytest.raises(ValueError, match="shape mismatch:"): - gdf.iloc[[0, 2]] = cudf.DataFrame({"x": [10, 20]}, index=cudf.Index([0, 2])) + gdf.iloc[[0, 2]] = cudf.DataFrame( + {"x": [10, 20]}, index=cudf.Index([0, 2]) + ) @pytest.mark.parametrize( @@ -1761,14 +1803,18 @@ def test_boolean_mask_columns_iloc_series(): @pytest.mark.parametrize("index_type", ["single", "slice"]) def test_loc_timestamp_issue_8585(index_type): # https://github.com/rapidsai/cudf/issues/8585 - start = pd.Timestamp(datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M")) + start = pd.Timestamp( + datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") + ) end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=12) value = np.random.normal(size=12) df = pd.DataFrame(value, index=timestamps, columns=["value"]) cdf = cudf.from_pandas(df) if index_type == "single": - index = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) + index = pd.Timestamp( + datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M") + ) elif index_type == "slice": index = slice(start, end, None) else: @@ -1798,7 +1844,9 @@ def test_loc_timestamp_issue_8585(index_type): ) def test_loc_multiindex_timestamp_issue_8585(index_type): # https://github.com/rapidsai/cudf/issues/8585 - start = pd.Timestamp(datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M")) + start = pd.Timestamp( + datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") + ) end = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=4) labels = ["A", "B", "C"] @@ -1808,10 +1856,14 @@ def test_loc_multiindex_timestamp_issue_8585(index_type): value = np.random.normal(size=12) df = pd.DataFrame(value, index=index, 
columns=["value"]) cdf = cudf.from_pandas(df) - start = pd.Timestamp(datetime.strptime("2021-03-12 01:00", "%Y-%m-%d %H:%M")) + start = pd.Timestamp( + datetime.strptime("2021-03-12 01:00", "%Y-%m-%d %H:%M") + ) end = pd.Timestamp(datetime.strptime("2021-03-12 02:00", "%Y-%m-%d %H:%M")) if index_type == "single": - index = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) + index = pd.Timestamp( + datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M") + ) elif index_type == "slice": index = slice(start, end, None) elif index_type == "date_range": @@ -2061,7 +2113,9 @@ def test_loc_index_inindex_subset(self, df, take_order): actual = df.loc[vals] assert_eq(expect, actual) - def test_loc_index_notinindex_slice(self, request, df, order, dtype, take_order): + def test_loc_index_notinindex_slice( + self, request, df, order, dtype, take_order + ): pdf = df.to_pandas() lo = pdf.index[1] hi = pdf.index[-2] @@ -2144,7 +2198,9 @@ def test_loc_setitem_categorical_integer_not_position_based(): @pytest.mark.parametrize("typ", ["datetime64[ns]", "timedelta64[ns]"]) @pytest.mark.parametrize("idx_method, key", [["iloc", 0], ["loc", "a"]]) -def test_series_iloc_scalar_datetimelike_return_pd_scalar(typ, idx_method, key): +def test_series_iloc_scalar_datetimelike_return_pd_scalar( + typ, idx_method, key +): obj = cudf.Series([1, 2, 3], index=list("abc"), dtype=typ) with cudf.option_context("mode.pandas_compatible", True): result = getattr(obj, idx_method)[key] @@ -2159,7 +2215,9 @@ def test_series_iloc_scalar_datetimelike_return_pd_scalar(typ, idx_method, key): def test_dataframe_iloc_scalar_datetimelike_return_pd_scalar( typ, idx_method, row_key, col_key ): - obj = cudf.DataFrame([1, 2, 3], index=list("abc"), columns=["a"], dtype=typ) + obj = cudf.DataFrame( + [1, 2, 3], index=list("abc"), columns=["a"], dtype=typ + ) with cudf.option_context("mode.pandas_compatible", True): result = getattr(obj, idx_method)[row_key, col_key] expected = getattr(obj.to_pandas(), idx_method)[row_key, col_key] @@ -2179,7 +2237,9 @@ def test_series_iloc_scalar_interval_return_pd_scalar(idx_method, key): @pytest.mark.parametrize( "idx_method, row_key, col_key", [["iloc", 0, 0], ["loc", "a", "a"]] ) -def test_dataframe_iloc_scalar_interval_return_pd_scalar(idx_method, row_key, col_key): +def test_dataframe_iloc_scalar_interval_return_pd_scalar( + idx_method, row_key, col_key +): iidx = cudf.IntervalIndex.from_breaks([1, 2, 3]) obj = cudf.DataFrame({"a": iidx}, index=list("ab")) with cudf.option_context("mode.pandas_compatible", True): @@ -2189,7 +2249,9 @@ def test_dataframe_iloc_scalar_interval_return_pd_scalar(idx_method, row_key, co def test_scalar_loc_row_categoricalindex(): - df = cudf.DataFrame(range(4), index=cudf.CategoricalIndex(["a", "a", "b", "c"])) + df = cudf.DataFrame( + range(4), index=cudf.CategoricalIndex(["a", "a", "b", "c"]) + ) result = df.loc["a"] expected = df.to_pandas().loc["a"] assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 59ecd25d9d2..a0e90cc89a2 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -67,7 +67,9 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got, check_dtype=psr.dtype != "object") -@pytest.mark.parametrize("data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])]) +@pytest.mark.parametrize( + "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])] +) def 
test_interpolate_series_unsorted_index(data, index): gsr = cudf.Series(data, index=index) psr = gsr.to_pandas() diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 4dd584f4447..7b923af1f75 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -57,7 +57,9 @@ def test_create_interval_series(data1, data2, data3, data4, closed): @pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) def test_create_interval_df(data1, data2, data3, data4, closed): # df for both pandas and cudf only works when interval is in a list - expect = pd.DataFrame([pd.Interval(data1, data2, closed)], dtype="interval") + expect = pd.DataFrame( + [pd.Interval(data1, data2, closed)], dtype="interval" + ) got = cudf.DataFrame([pd.Interval(data1, data2, closed)], dtype="interval") assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py index 335c9511995..8d71a6c05b8 100644 --- a/python/cudf/cudf/tests/test_join_order.py +++ b/python/cudf/cudf/tests/test_join_order.py @@ -45,7 +45,8 @@ def expected(left, right, sort, *, how): def test_join_ordering_pandas_compat(request, left, right, sort, how): request.applymarker( pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION and how == "right", + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION + and how == "right", reason="TODO: Result ording of suffix'ed columns is incorrect", ) ) @@ -126,7 +127,9 @@ def test_merge_combinations( expected = expected.sort_values("key") if not other_unique: other_value_counts = other["key"].value_counts() - repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) + repeats = other_value_counts.reindex( + expected["key"].values, fill_value=1 + ) repeats = repeats.astype(np.intp) expected = expected["key"].repeat(repeats.values) expected = expected.to_frame() diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 43f67751e66..c063043b72a 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -71,11 +71,15 @@ def assert_join_results_equal(expect, got, how, **kwargs): **kwargs, ) elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - if not len(expect.columns): # can't sort_values() on a df without columns + if not len( + expect.columns + ): # can't sort_values() on a df without columns return assert_eq(expect, got, **kwargs) assert_eq( - expect.sort_values(expect.columns.to_list()).reset_index(drop=True), + expect.sort_values(expect.columns.to_list()).reset_index( + drop=True + ), got.sort_values(got.columns.to_list()).reset_index(drop=True), **kwargs, ) @@ -147,7 +151,9 @@ def _check_series(expect, got): nanfilled_equal = np.all( expect.fillna(magic).values == got.fillna(magic).to_numpy() ) - msg = "direct_equal={}, nanfilled_equal={}".format(direct_equal, nanfilled_equal) + msg = "direct_equal={}, nanfilled_equal={}".format( + direct_equal, nanfilled_equal + ) assert direct_equal or nanfilled_equal, msg @@ -162,7 +168,9 @@ def test_dataframe_join_suffix(): left = df.set_index("a") right = df.set_index("c") - msg = "there are overlapping columns but lsuffix and rsuffix are not defined" + msg = ( + "there are overlapping columns but lsuffix and rsuffix are not defined" + ) with pytest.raises(ValueError, match=msg): left.join(right) @@ -238,7 +246,9 @@ def test_dataframe_join_mismatch_cats(how): "data_col_left": [10, 20, 30, 40, 50], } ) - pdf2 = 
pd.DataFrame({"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]}) + pdf2 = pd.DataFrame( + {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]} + ) pdf1["join_col"] = pdf1["join_col"].astype("category") pdf2["join_col"] = pdf2["join_col"].astype("category") @@ -307,7 +317,9 @@ def test_dataframe_merge_on(on): for col in list(pddf_joined.columns): if col.count("_y") > 0: - join_result[col] = join_result[col].astype(np.float64).fillna(np.nan) + join_result[col] = ( + join_result[col].astype(np.float64).fillna(np.nan) + ) join_result_cudf[col] = ( join_result_cudf[col].astype(np.float64).fillna(np.nan) ) @@ -319,9 +331,9 @@ def test_dataframe_merge_on(on): .reset_index(drop=True) ) - pdf_result = pddf_joined.sort_values(list(pddf_joined.columns)).reset_index( - drop=True - ) + pdf_result = pddf_joined.sort_values( + list(pddf_joined.columns) + ).reset_index(drop=True) assert_join_results_equal(cdf_result, pdf_result, how="left") @@ -464,7 +476,9 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): match="No common columns to perform merge on", ): pdf_left.merge(pdf_right) - with pytest.raises(ValueError, match="No common columns to perform merge on"): + with pytest.raises( + ValueError, match="No common columns to perform merge on" + ): gdf_left.merge(gdf_right) elif not [value for value in pdf_left if value in pdf_right]: with pytest.raises( @@ -472,7 +486,9 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): match="No common columns to perform merge on", ): pdf_left.merge(pdf_right) - with pytest.raises(ValueError, match="No common columns to perform merge on"): + with pytest.raises( + ValueError, match="No common columns to perform merge on" + ): gdf_left.merge(gdf_right) else: pdf_result = pdf_left.merge(pdf_right, how=how) @@ -532,7 +548,9 @@ def test_empty_joins(how, left_empty, right_empty): def test_merge_left_index_zero(): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) - right = pd.DataFrame({"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6]) + right = pd.DataFrame( + {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] + ) gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) pd_merge = left.merge(right, left_on="x", right_on="y") @@ -552,7 +570,9 @@ def test_merge_left_index_zero(): ) def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) - right = pd.DataFrame({"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6]) + right = pd.DataFrame( + {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] + ) gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) @@ -571,7 +591,9 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): ) def test_merge_left_right_index_left_right_on_kwargs(kwargs): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6]) - right = pd.DataFrame({"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7]) + right = pd.DataFrame( + {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7] + ) gleft = cudf.from_pandas(left) gright = cudf.from_pandas(right) pd_merge = left.merge(right, **kwargs) @@ -662,7 +684,9 @@ def test_merge_left_right_index_left_right_on_kwargs2(kwargs): assert gd_merge.empty -@pytest.mark.parametrize("hows", [{"how": "inner"}, {"how": "left"}, {"how": "outer"}]) +@pytest.mark.parametrize( + "hows", [{"how": "inner"}, {"how": "left"}, {"how": "outer"}] +) @pytest.mark.parametrize( "ons", [ @@ -701,8 +725,12 @@ def 
test_merge_sort(ons, hows): gd_merge = gd_merge.drop(kwargs["on"], axis=1) if not pd_merge.empty: # check to make sure the non join key columns are the same - pd_merge = pd_merge.sort_values(list(pd_merge.columns)).reset_index(drop=True) - gd_merge = gd_merge.sort_values(list(gd_merge.columns)).reset_index(drop=True) + pd_merge = pd_merge.sort_values(list(pd_merge.columns)).reset_index( + drop=True + ) + gd_merge = gd_merge.sort_values(list(gd_merge.columns)).reset_index( + drop=True + ) assert_join_results_equal(pd_merge, gd_merge, how="left") @@ -820,7 +848,9 @@ def test_join_empty_table_dtype(): ), ( pd.Series(["dog", "cat", "fish", "bug"] * 2).astype("category"), - pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype("category"), + pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype( + "category" + ), ), ], ) @@ -1055,7 +1085,9 @@ def test_typecast_on_join_no_float_round(): exp_By = ["a", "b", "c", None, None] exp_join_col = cudf.Series(exp_join_data, dtype="float32") - expect = cudf.DataFrame({"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By}) + expect = cudf.DataFrame( + {"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By} + ) got = gdf_l.merge(gdf_r, on="join_col", how="left") @@ -1115,8 +1147,12 @@ def test_typecast_on_join_overflow_unsafe(dtypes): def test_decimal_typecast_inner(dtype): other_data = ["a", "b", "c", "d", "e"] - join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype(dtype) - join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype(dtype) + join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype( + dtype + ) + join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype( + dtype + ) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) @@ -1152,8 +1188,12 @@ def test_decimal_typecast_inner(dtype): def test_decimal_typecast_left(dtype): other_data = ["a", "b", "c", "d"] - join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype(dtype) - join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1456.9472"]).astype(dtype) + join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype( + dtype + ) + join_data_r = cudf.Series( + ["95.05", "62.4056", "74.22", "1456.9472"] + ).astype(dtype) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) @@ -1189,8 +1229,12 @@ def test_decimal_typecast_left(dtype): ) def test_decimal_typecast_outer(dtype): other_data = ["a", "b", "c"] - join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype(dtype) - join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype(dtype) + join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype( + dtype + ) + join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype( + dtype + ) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"] @@ -1221,8 +1265,12 @@ def test_decimal_typecast_outer(dtype): def test_mixed_decimal_typecast(dtype_l, dtype_r): other_data = ["a", "b", "c", "d"] - join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype(dtype_r) - join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype(dtype_l) + join_data_l = cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype( + dtype_r + ) + join_data_r = 
cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype( + dtype_l + ) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) @@ -1542,7 +1590,9 @@ def test_index_join(lhs, rhs, how, level): def test_index_join_corner_cases(): l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}) + r_pdf = pd.DataFrame( + {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} + ) l_df = cudf.from_pandas(l_pdf) r_df = cudf.from_pandas(r_pdf) @@ -1599,7 +1649,9 @@ def test_index_join_corner_cases(): def test_index_join_exception_cases(): l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_df = cudf.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}) + r_df = cudf.DataFrame( + {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} + ) # Join between two MultiIndex lhs = ["a", "b"] @@ -1741,7 +1793,9 @@ def test_typecast_on_join_indexes_matching_categorical(): cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), ], ) -@pytest.mark.parametrize("how", ["left", "inner", "outer", "leftanti", "leftsemi"]) +@pytest.mark.parametrize( + "how", ["left", "inner", "outer", "leftanti", "leftsemi"] +) @pytest.mark.parametrize( "kwargs", [ @@ -1771,7 +1825,9 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): @pytest.mark.xfail(reason="Cannot sort values of list dtype") -@pytest.mark.parametrize("how", ["left", "inner", "right", "leftanti", "leftsemi"]) +@pytest.mark.parametrize( + "how", ["left", "inner", "right", "leftanti", "leftsemi"] +) def test_merge_with_lists(how): pd_left = pd.DataFrame( { @@ -1802,7 +1858,9 @@ def test_join_renamed_index(): ).set_index([0, 1]) df.index.names = ["a", "b"] # doesn't actually change df._index._data - expect = df.to_pandas().merge(df.to_pandas(), left_index=True, right_index=True) + expect = df.to_pandas().merge( + df.to_pandas(), left_index=True, right_index=True + ) got = df.merge(df, left_index=True, right_index=True, how="inner") assert_join_results_equal(expect, got, how="inner") @@ -2063,7 +2121,9 @@ def test_string_join_values_nulls(): ) def test_merge_mixed_index_columns(left_on, right_on): left = pd.DataFrame({"a": [1, 2, 1, 2], "b": [2, 3, 3, 4]}).set_index("a") - right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index("a") + right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index( + "a" + ) left["c"] = 10 @@ -2108,8 +2168,12 @@ def test_join_on_index_with_duplicate_names(): # overall, we *should* be able to join on them: lhs = pd.DataFrame({"a": [1, 2, 3]}) rhs = pd.DataFrame({"b": [1, 2, 3]}) - lhs.index = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["x", "x"]) - rhs.index = pd.MultiIndex.from_tuples([(1, 1), (1, 3), (2, 1)], names=["x", "x"]) + lhs.index = pd.MultiIndex.from_tuples( + [(1, 1), (1, 2), (2, 1)], names=["x", "x"] + ) + rhs.index = pd.MultiIndex.from_tuples( + [(1, 1), (1, 3), (2, 1)], names=["x", "x"] + ) expect = lhs.join(rhs, how="inner") lhs = cudf.from_pandas(lhs) @@ -2123,7 +2187,9 @@ def test_join_redundant_params(): lhs = cudf.DataFrame( {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") ) - rhs = cudf.DataFrame({"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a")) + rhs = cudf.DataFrame( + {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") + ) with pytest.raises(ValueError): lhs.merge(rhs, on="a", left_index=True) with pytest.raises(ValueError): @@ -2136,7 
+2202,11 @@ def test_join_redundant_params(): def test_join_multiindex_index(): # test joining a MultiIndex with an Index with overlapping name - lhs = cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}).set_index(["a", "b"]).index + lhs = ( + cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) + .set_index(["a", "b"]) + .index + ) rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") got = lhs.join(rhs, how="inner") diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index d33133f32f5..40935733f34 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -37,7 +37,10 @@ def pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types} + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + } ) # Delete the name of the column index, and rename the row index test_pdf.columns.name = None @@ -88,7 +91,8 @@ def json_files(request, tmp_path_factory, pdf): index, compression, orient = request.param if index is False and orient not in ("split", "table"): pytest.skip( - "'index=False' is only valid when 'orient' is 'split' or " "'table'" + "'index=False' is only valid when 'orient' is 'split' or " + "'table'" ) if index is False and orient == "table": pytest.skip("'index=False' isn't valid when 'orient' is 'table'") @@ -171,7 +175,9 @@ def test_json_writer(tmpdir, pdf, gdf): assert_eq(pdf_string, gdf_string) -@pytest.mark.parametrize("lines", [True, False], ids=["lines=True", "lines=False"]) +@pytest.mark.parametrize( + "lines", [True, False], ids=["lines=True", "lines=False"] +) def test_cudf_json_writer(pdf, lines): # removing datetime column because pandas doesn't support it for col_name in pdf.columns: @@ -192,9 +198,12 @@ def test_cudf_json_writer(pdf, lines): def test_cudf_json_writer_read(gdf_writer_types): dtypes = { - col_name: col_name[len("col_") :] for col_name in gdf_writer_types.columns + col_name: col_name[len("col_") :] + for col_name in gdf_writer_types.columns } - gdf_string = gdf_writer_types.to_json(orient="records", lines=True, engine="cudf") + gdf_string = gdf_writer_types.to_json( + orient="records", lines=True, engine="cudf" + ) gdf2 = cudf.read_json( StringIO(gdf_string), lines=True, @@ -293,7 +302,9 @@ def test_cudf_json_writer_sinks(sink, tmp_path_factory): target = tmp_path_factory.mktemp("json") / "test_df.json" df.to_json(target, engine="cudf") if sink == "string": - assert target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' + assert ( + target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' + ) elif sink == "file": assert os.path.exists(target) with open(target, "r") as f: @@ -411,27 +422,37 @@ def test_json_read_directory(tmpdir, json_input, engine): def test_json_lines_byte_range(json_input): # include the first row and half of the second row # should parse the first two rows - will_warn = isinstance(json_input, str) and not json_input.endswith(".json") + will_warn = isinstance(json_input, str) and not json_input.endswith( + ".json" + ) with expect_warning_if(will_warn): - df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(0, 15)) + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(0, 15) + ) assert df.shape == (2, 3) # include half of the second row and half of the third row # should parse only the third row with 
expect_warning_if(will_warn): - df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(15, 10)) + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 10) + ) assert df.shape == (1, 3) # include half of the second row and entire third row # should parse only the third row with expect_warning_if(will_warn): - df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(15, 0)) + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 0) + ) assert df.shape == (1, 3) # include half of the second row till past the end of the file # should parse only the third row with expect_warning_if(will_warn): - df = cudf.read_json(copy.deepcopy(json_input), lines=True, byte_range=(10, 50)) + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(10, 50) + ) assert df.shape == (1, 3) @@ -474,7 +495,9 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): @pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings("ignore:engine='cudf_legacy' is a deprecated engine.") +@pytest.mark.filterwarnings( + "ignore:engine='cudf_legacy' is a deprecated engine." +) def test_json_engine_selection(): json = "[1, 2, 3]" @@ -518,7 +541,9 @@ def test_json_bool_values(): np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) -@pytest.mark.filterwarnings("ignore:engine='cudf_legacy' is a deprecated engine.") +@pytest.mark.filterwarnings( + "ignore:engine='cudf_legacy' is a deprecated engine." +) @pytest.mark.parametrize( "buffer", [ @@ -534,7 +559,9 @@ def test_json_null_literal(buffer): # first column contains a null field, type should be set to float # second column contains only empty fields, type should be set to int8 np.testing.assert_array_equal(df.dtypes, ["float64", "int8"]) - np.testing.assert_array_equal(df["0"].to_numpy(na_value=np.nan), [1.0, np.nan]) + np.testing.assert_array_equal( + df["0"].to_numpy(na_value=np.nan), [1.0, np.nan] + ) np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0]) @@ -561,8 +588,12 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_pandas( ) pdf.to_json(fname, compression="infer", lines=True, orient="records") - df = cudf.read_json(fname, compression="infer", lines=True, orient="records") - pdf = pd.read_json(fname, compression="infer", lines=True, orient="records") + df = cudf.read_json( + fname, compression="infer", lines=True, orient="records" + ) + pdf = pd.read_json( + fname, compression="infer", lines=True, orient="records" + ) assert_eq(cudf.DataFrame(pdf), df) @@ -575,7 +606,9 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_strings(): {"a":"\'","b":"\\t","c":"cudf"}""" ) - df = cudf.read_json(str_buffer, compression="infer", lines=True, orient="records") + df = cudf.read_json( + str_buffer, compression="infer", lines=True, orient="records" + ) expected = { "a": ['ab"cd', "\\\b", "\r\\", "'"], @@ -627,7 +660,9 @@ def test_json_to_json_special_characters(): ( cudf.DataFrame( { - "int64 col": cudf.Series([1, 2, None, 2323, None], dtype="int64"), + "int64 col": cudf.Series( + [1, 2, None, 2323, None], dtype="int64" + ), "string col": cudf.Series( ["abc", "a", None, "", None], dtype="str" ), @@ -665,7 +700,9 @@ def test_json_to_json_special_characters(): [None, True, False, None, True], dtype=pd.BooleanDtype(), ), - "categorical col": pd.Series([1, 2, 1, None, 2], dtype="category"), + "categorical col": pd.Series( + [1, 2, 1, None, 2], dtype="category" + ), "datetime col": pd.Series( [1231233, None, 2323234, 
None, 1], dtype="datetime64[ns]", @@ -798,7 +835,9 @@ def test_json_nested_lines(data, lines): # such that pandas would have the f1 member with null # Also, pyarrow chooses to select different ordering of a nested column # children though key-value pairs are correct. - pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) @@ -810,7 +849,9 @@ def test_json_nested_data(): df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") pdf = pd.read_json(StringIO(json_str), orient="records") pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) @@ -839,7 +880,9 @@ def test_json_types_data(): df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") pdf = pd.read_json(StringIO(json_str), orient="records") pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) @@ -1062,9 +1105,13 @@ def test_json_dtypes_nested_data(): }, ) - pdf = pd.read_json(StringIO(expected_json_str), orient="records", lines=True) + pdf = pd.read_json( + StringIO(expected_json_str), orient="records", lines=True + ) pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) @@ -1240,7 +1287,9 @@ def test_json_array_of_arrays(data, lines): # for values orient in cudf json reader pdf.rename(columns={name: str(name) for name in pdf.columns}, inplace=True) # assert_eq(pdf, df) - pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) @@ -1336,7 +1385,9 @@ def _replace_with_nulls(df, replace_items): # {} in pandas is represented as {"0": None} in cudf assert_eq(gdf, pdf) assert_eq(gdf2, pdf) - pa_table_pdf = pa.Table.from_pandas(pdf, schema=gdf.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=gdf.to_arrow().schema, safe=False + ) assert gdf.to_arrow().equals(pa_table_pdf) assert gdf2.to_arrow().equals(pa_table_pdf) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 432081552f7..f04cb8a91a4 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -203,11 +203,15 @@ def test_take(data, idx): ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), ( [["a", "b"], ["c"]], - pytest.raises(TypeError, match="should be column of values of index types"), + pytest.raises( + TypeError, match="should be column of values of index types" + ), ), ( [[[1], [0]], [[0]]], - pytest.raises(TypeError, match="should be column of values of index types"), + pytest.raises( + TypeError, match="should be column of values of index types" + ), ), ([[0, 1], None], pytest.raises(ValueError, match="contains null")), ], @@ -274,7 +278,9 @@ def key_func_builder(x, na_position): [ None, pd.Index(["a", "b", "c"]), 
- pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]), + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), ], ) @pytest.mark.parametrize("ascending", [True, False]) @@ -357,7 +363,9 @@ def test_get_default(): assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) - string_sr = cudf.Series([["apple", "banana"], ["carrot", "daffodil", "elephant"]]) + string_sr = cudf.Series( + [["apple", "banana"], ["carrot", "daffodil", "elephant"]] + ) assert_eq( cudf.Series(["default", "elephant"]), string_sr.list.get(2, default="default"), @@ -369,7 +377,9 @@ def test_get_default(): sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1)) assert_eq(cudf.Series([[5, 6], cudf.NA]), sr_nested.list.get(2)) - assert_eq(cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0])) + assert_eq( + cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) + ) def test_get_ind_sequence(): @@ -470,7 +480,8 @@ def test_contains_invalid(data, scalar): sr = cudf.Series(data) with pytest.raises( TypeError, - match="Type/Scale of search key does not " "match list column element type.", + match="Type/Scale of search key does not " + "match list column element type.", ): sr.list.contains(scalar) @@ -527,7 +538,9 @@ def test_index(data, search_key, expect): if is_scalar(search_key): got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) else: - got = sr.list.index(cudf.Series(search_key, dtype=sr.dtype.element_type)) + got = sr.list.index( + cudf.Series(search_key, dtype=sr.dtype.element_type) + ) assert_eq(expect, got) @@ -553,7 +566,8 @@ def test_index_invalid_type(data, search_key): sr = cudf.Series(data) with pytest.raises( TypeError, - match="Type/Scale of search key does not " "match list column element type.", + match="Type/Scale of search key does not " + "match list column element type.", ): sr.list.index(search_key) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 8e0fed12c19..53919a95115 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -16,8 +16,12 @@ @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) def test_range_index(testrange): - index = RangeIndex(start=testrange[0], stop=testrange[1], step=testrange[2]) - index_pd = pd.RangeIndex(start=testrange[0], stop=testrange[1], step=testrange[2]) + index = RangeIndex( + start=testrange[0], stop=testrange[1], step=testrange[2] + ) + index_pd = pd.RangeIndex( + start=testrange[0], stop=testrange[1], step=testrange[2] + ) assert index.is_unique == index_pd.is_unique assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing @@ -64,7 +68,9 @@ def test_string_index(testlist): assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing -@pytest.mark.parametrize("testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]]) +@pytest.mark.parametrize( + "testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]] +) def test_categorical_index(testlist): # Assuming unordered categorical data cannot be "monotonic" raw_cat = pd.Categorical(testlist, ordered=True) @@ -157,8 +163,12 @@ def test_multiindex(): gdf = cudf.from_pandas(pdf) assert pdf.index.is_unique == gdf.index.is_unique - assert pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing - assert 
pdf.index.is_monotonic_decreasing == gdf.index.is_monotonic_decreasing + assert ( + pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing + ) + assert ( + pdf.index.is_monotonic_decreasing == gdf.index.is_monotonic_decreasing + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 9d33453da59..4926d79e734 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -262,9 +262,9 @@ def test_multiindex_transpose(pdf, pdfIndex): def test_from_pandas_series(): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( - ["a", "b"] - ) + pdf = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + ).set_index(["a", "b"]) result = cudf.from_pandas(pdf) assert_eq(pdf, result) @@ -292,8 +292,12 @@ def test_multiindex_take(pdf, gdf, pdfIndex): assert_eq(pdf.index.take(pd.Series([0])), gdf.index.take(Series([0]))) assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1])) - assert_eq(pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1]))) - assert_eq(pdf.index.take(pd.Series([0, 1])), gdf.index.take(Series([0, 1]))) + assert_eq( + pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1])) + ) + assert_eq( + pdf.index.take(pd.Series([0, 1])), gdf.index.take(Series([0, 1])) + ) def test_multiindex_getitem(pdf, gdf, pdfIndex): @@ -388,7 +392,9 @@ def test_multiindex_loc_errors(pdf, gdf, pdfIndex): with pytest.raises(KeyError): gdf.loc[("a", "store", "clouds", "foo")] with pytest.raises(IndexError): - gdf.loc[("a", "store", "clouds", "fire", "x", "y")] # too many indexers + gdf.loc[ + ("a", "store", "clouds", "fire", "x", "y") + ] # too many indexers with pytest.raises(IndexError): gdf.loc[slice(None, ("a", "store", "clouds", "fire", "x", "y"))] @@ -489,7 +495,9 @@ def test_multiindex_from_tuples(): def test_multiindex_from_dataframe(): if not hasattr(pd.MultiIndex([[]], [[]]), "codes"): pytest.skip() - pdf = pd.DataFrame([["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]]) + pdf = pd.DataFrame( + [["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]] + ) gdf = cudf.from_pandas(pdf) pmi = pd.MultiIndex.from_frame(pdf, names=["alpha", "location"]) gmi = cudf.MultiIndex.from_frame(gdf, names=["alpha", "location"]) @@ -528,7 +536,9 @@ def test_multiindex_index_and_columns(): names=["x", "y"], ) gdf.index = mi - mc = cudf.MultiIndex(levels=[["val"], ["mean", "min"]], codes=[[0, 0], [0, 1]]) + mc = cudf.MultiIndex( + levels=[["val"], ["mean", "min"]], codes=[[0, 0], [0, 1]] + ) gdf.columns = mc pdf.index = mi.to_pandas() pdf.columns = mc.to_pandas() @@ -820,29 +830,43 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): - same_ref = (not deep) or (cudf.get_option("copy_on_write") and not deep) + same_ref = (not deep) or ( + cudf.get_option("copy_on_write") and not deep + ) mi1 = data mi2 = mi1.copy(deep=deep) # Assert ._levels identity lptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") for lv in mi1._levels + lv._data._data[None].base_data.get_ptr(mode="read") + for lv in mi1._levels ] rptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") for lv in mi2._levels + lv._data._data[None].base_data.get_ptr(mode="read") + for lv in mi2._levels ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity - lptrs = [c.base_data.get_ptr(mode="read") for 
_, c in mi1._codes._data.items()] - rptrs = [c.base_data.get_ptr(mode="read") for _, c in mi2._codes._data.items()] + lptrs = [ + c.base_data.get_ptr(mode="read") + for _, c in mi1._codes._data.items() + ] + rptrs = [ + c.base_data.get_ptr(mode="read") + for _, c in mi2._codes._data.items() + ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [d.base_data.get_ptr(mode="read") for _, d in mi1._data.items()] - rptrs = [d.base_data.get_ptr(mode="read") for _, d in mi2._data.items()] + lptrs = [ + d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() + ] + rptrs = [ + d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() + ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) @@ -884,7 +908,9 @@ def test_multiindex_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): presult = pdf.iloc[iloc_rows, iloc_columns] gresult = gdf.iloc[iloc_rows, iloc_columns] if isinstance(gresult, cudf.DataFrame): - assert_eq(presult, gresult, check_index_type=False, check_column_type=False) + assert_eq( + presult, gresult, check_index_type=False, check_column_type=False + ) else: assert_eq(presult, gresult, check_index_type=False, check_dtype=False) @@ -893,7 +919,9 @@ def test_multiindex_iloc_scalar(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] tuples = list(zip(*arrays)) idx = cudf.MultiIndex.from_tuples(tuples) - gdf = cudf.DataFrame({"first": cp.random.rand(4), "second": cp.random.rand(4)}) + gdf = cudf.DataFrame( + {"first": cp.random.rand(4), "second": cp.random.rand(4)} + ) gdf.index = idx pdf = gdf.to_pandas() @@ -942,13 +970,17 @@ def test_multicolumn_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): if isinstance(name, str) and "cudf" in name: gresult.name = name if isinstance(presult, pd.DataFrame): - assert_eq(presult, gresult, check_index_type=False, check_column_type=False) + assert_eq( + presult, gresult, check_index_type=False, check_column_type=False + ) else: assert_eq(presult, gresult, check_index_type=False, check_dtype=False) def test_multicolumn_item(): - gdf = cudf.DataFrame({"x": np.arange(10), "y": np.arange(10), "z": np.arange(10)}) + gdf = cudf.DataFrame( + {"x": np.arange(10), "y": np.arange(10), "z": np.arange(10)} + ) gdg = gdf.groupby(["x", "y"]).min() gdgT = gdg.T pdgT = gdgT.to_pandas() @@ -1034,7 +1066,9 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdf.index = gdfIndex # The index is unsorted, which makes things slow but is fine for testing. 
with pytest.warns(pd.errors.PerformanceWarning): - assert_eq(pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index()) + assert_eq( + pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index() + ) assert_eq( pdf.loc[(("a"), ("store")), :].sort_index(), gdf.loc[(("a"), ("store")), :].sort_index(), @@ -1056,8 +1090,12 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), slice(None), slice(None), "smoke"), :].sort_index(), - gdf.loc[(slice(None), slice(None), slice(None), "smoke"), :].sort_index(), + pdf.loc[ + (slice(None), slice(None), slice(None), "smoke"), : + ].sort_index(), + gdf.loc[ + (slice(None), slice(None), slice(None), "smoke"), : + ].sort_index(), ) @@ -1384,8 +1422,12 @@ def test_multiindex_sort_values(pmidx, ascending, return_indexer): pmidx = pmidx midx = cudf.from_pandas(pmidx) - expected = pmidx.sort_values(ascending=ascending, return_indexer=return_indexer) - actual = midx.sort_values(ascending=ascending, return_indexer=return_indexer) + expected = pmidx.sort_values( + ascending=ascending, return_indexer=return_indexer + ) + actual = midx.sort_values( + ascending=ascending, return_indexer=return_indexer + ) if return_indexer: expected_indexer = expected[1] @@ -1468,7 +1510,9 @@ def test_multiindex_set_names(idx, names, inplace): @pytest.mark.parametrize( "idx", [ - pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019], ["aab", "bcd"]]), + pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] + ), pd.MultiIndex.from_product( [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], names=[1, 0, 2], @@ -1487,7 +1531,9 @@ def test_multiindex_set_names(idx, names, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names_default_and_int_names(idx, level, names, inplace): +def test_multiindex_set_names_default_and_int_names( + idx, level, names, inplace +): pi = idx.copy() gi = cudf.from_pandas(idx) @@ -1540,7 +1586,9 @@ def test_multiindex_set_names_string_names(idx, level, names, inplace): "level, names", [(1, ["a"]), (None, "a"), ([1, 2], ["a"]), (None, ["a"])] ) def test_multiindex_set_names_error(level, names): - pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019], ["aab", "bcd"]]) + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] + ) gi = cudf.from_pandas(pi) assert_exceptions_equal( @@ -1585,7 +1633,9 @@ def test_multiindex_rename(idx, names, inplace): assert_eq(expected, actual) -@pytest.mark.parametrize("names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]]) +@pytest.mark.parametrize( + "names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]] +) def test_multiindex_rename_error(names): pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) gi = cudf.from_pandas(pi) @@ -1648,8 +1698,12 @@ def test_difference(): "idx1, idx2", [ ( - pd.MultiIndex.from_arrays([[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]), - pd.MultiIndex.from_arrays([[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]]), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), ), ( pd.MultiIndex.from_arrays( @@ -1694,8 +1748,12 @@ def test_union_mulitIndex(idx1, idx2, sort): "idx1, idx2", [ ( - pd.MultiIndex.from_arrays([[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]), - pd.MultiIndex.from_arrays([[1, 3, 2, 2], ["Red", "Green", "Red", 
"Green"]]), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), ), ( pd.MultiIndex.from_arrays( @@ -1770,7 +1828,9 @@ def test_pickle_roundtrip_multiindex(names): @pytest.mark.parametrize( "pidx", [ - pd.MultiIndex.from_arrays([[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), pd.MultiIndex.from_arrays( [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], names=["a", "b", "c"], @@ -1811,7 +1871,9 @@ def test_multiindex_index_single_row(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] tuples = list(zip(*arrays)) idx = cudf.MultiIndex.from_tuples(tuples) - gdf = cudf.DataFrame({"first": cp.random.rand(4), "second": cp.random.rand(4)}) + gdf = cudf.DataFrame( + {"first": cp.random.rand(4), "second": cp.random.rand(4)} + ) gdf.index = idx pdf = gdf.to_pandas() assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)]) @@ -1884,7 +1946,9 @@ def test_multiindex_to_series_error(): ) @pytest.mark.parametrize("allow_duplicates", [True, False]) @pytest.mark.parametrize("index", [True, False]) -def test_multiindex_to_frame_allow_duplicates(pidx, name, allow_duplicates, index): +def test_multiindex_to_frame_allow_duplicates( + pidx, name, allow_duplicates, index +): gidx = cudf.from_pandas(pidx) if name is None or ( diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index a189ec8bc6b..2e3be92dbeb 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -43,7 +43,9 @@ def test_can_cast_safely_same_kind(): to_dtype = np.dtype("float32") assert data.can_cast_safely(to_dtype) - data = cudf.Series([np.finfo("float32").max * 2, 1.0], dtype="float64")._column + data = cudf.Series( + [np.finfo("float32").max * 2, 1.0], dtype="float64" + )._column to_dtype = np.dtype("float32") assert not data.can_cast_safely(to_dtype) @@ -140,7 +142,9 @@ def test_can_cast_safely_has_nulls(): ), # Categories with nulls pd.Series([1, 2, 3], dtype=pd.CategoricalDtype(categories=[1, 2])), - pd.Series([5.0, 6.0], dtype=pd.CategoricalDtype(categories=[5.0, 6.0])), + pd.Series( + [5.0, 6.0], dtype=pd.CategoricalDtype(categories=[5.0, 6.0]) + ), pd.Series( ["2020-08-01 08:00:00", "1960-08-01 08:00:00"], dtype=np.dtype("= sizeof @@ -79,7 +81,9 @@ def test_packed_dataframe_equality_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) + df["keys"] = pd.Categorical( + ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] + ) df["vals"] = np.random.random(len(df)) check_packed_equality(df) @@ -99,7 +103,9 @@ def test_packed_dataframe_equality_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) + df["keys"] = Series( + list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) + ) df["vals"] = np.random.random(len(df)) check_packed_equality(df) @@ -143,7 +149,9 @@ def test_packed_dataframe_unique_pointers_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) + df["keys"] = pd.Categorical( + ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] + ) df["vals"] = np.random.random(len(df)) check_packed_unique_pointers(df) @@ -163,7 +171,9 @@ def test_packed_dataframe_unique_pointers_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = 
Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) + df["keys"] = Series( + list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) + ) df["vals"] = np.random.random(len(df)) check_packed_unique_pointers(df) @@ -182,7 +192,9 @@ def check_packed_pickled_equality(df): assert_packed_frame_picklable(sortvaldf) # out-of-band buffers = [] - serialbytes = pickle.dumps(pack(df), protocol=5, buffer_callback=buffers.append) + serialbytes = pickle.dumps( + pack(df), protocol=5, buffer_callback=buffers.append + ) for b in buffers: assert isinstance(b, pickle.PickleBuffer) loaded = unpack(pickle.loads(serialbytes, buffers=buffers)) @@ -210,7 +222,9 @@ def test_pickle_packed_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) + df["keys"] = pd.Categorical( + ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] + ) df["vals"] = np.random.random(len(df)) check_packed_pickled_equality(df) @@ -230,7 +244,9 @@ def test_pickle_packed_dataframe_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) + df["keys"] = Series( + list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) + ) df["vals"] = np.random.random(len(df)) check_packed_pickled_equality(df) @@ -271,7 +287,9 @@ def test_serialize_packed_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) + df["keys"] = pd.Categorical( + ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] + ) df["vals"] = np.random.random(len(df)) check_packed_serialized_equality(df) @@ -291,7 +309,9 @@ def test_serialize_packed_dataframe_struct(): np.random.seed(0) df = DataFrame() - df["keys"] = Series(list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))) + df["keys"] = Series( + list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) + ) df["vals"] = np.random.random(len(df)) check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 70cd7338b84..8b72fe84359 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -72,7 +72,10 @@ def simple_pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types}, + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + }, # Need to ensure that this index is not a RangeIndex to get the # expected round-tripping behavior from Parquet reader/writer. index=pd.Index(list(range(nrows))), @@ -111,7 +114,10 @@ def build_pdf(num_columns, day_resolution_timestamps): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( - {f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) for typ in types}, + { + f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + for typ in types + }, # Need to ensure that this index is not a RangeIndex to get the # expected round-tripping behavior from Parquet reader/writer. 
index=pd.Index(list(range(nrows))), @@ -142,7 +148,9 @@ def build_pdf(num_columns, day_resolution_timestamps): ] if day_resolution_timestamps: data = [int(d / t["dayModulus"]) * t["dayModulus"] for d in data] - test_pdf["col_" + t["name"]] = pd.Series(np.asarray(data, dtype=t["name"])) + test_pdf["col_" + t["name"]] = pd.Series( + np.asarray(data, dtype=t["name"]) + ) # Create non-numeric categorical data otherwise parquet may typecast it data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] @@ -177,7 +185,9 @@ def gdf_day_timestamps(pdf_day_timestamps): @pytest.fixture(params=["snappy", "gzip", "brotli", None, np.str_("snappy")]) def parquet_file(request, tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("parquet") / (str(request.param) + "_test.parquet") + fname = tmp_path_factory.mktemp("parquet") / ( + str(request.param) + "_test.parquet" + ) pdf.to_parquet(fname, engine="pyarrow", compression=request.param) return fname @@ -330,7 +340,9 @@ def test_parquet_reader_index_col(tmpdir, index_col, columns): @pytest.mark.parametrize("pandas_compat", [True, False]) -@pytest.mark.parametrize("columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None]) +@pytest.mark.parametrize( + "columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None] +) def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): df = pd.DataFrame( { @@ -351,7 +363,9 @@ def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): expect = pa.parquet.read_table( fname, columns=columns, use_pandas_metadata=pandas_compat ).to_pandas() - got = cudf.read_parquet(fname, columns=columns, use_pandas_metadata=pandas_compat) + got = cudf.read_parquet( + fname, columns=columns, use_pandas_metadata=pandas_compat + ) if pandas_compat or columns is None or "b" in columns: assert got.index.name == "b" @@ -374,12 +388,16 @@ def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index # Instead, directly use PyArrow to optionally omit the index - expect = pa.parquet.read_table(fname, use_pandas_metadata=pandas_compat).to_pandas() + expect = pa.parquet.read_table( + fname, use_pandas_metadata=pandas_compat + ).to_pandas() if as_bytes: # Make sure we can handle RangeIndex parsing # in pandas when the input is `bytes` with open(fname, "rb") as f: - got = cudf.read_parquet(f.read(), use_pandas_metadata=pandas_compat) + got = cudf.read_parquet( + f.read(), use_pandas_metadata=pandas_compat + ) else: got = cudf.read_parquet(fname, use_pandas_metadata=pandas_compat) @@ -418,7 +436,9 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): null_frequency=0.05, generator=lambda: [ "".join( - random.sample(string.ascii_letters, random.randint(4, 8)) + random.sample( + string.ascii_letters, random.randint(4, 8) + ) ) for _ in range(40) ], @@ -477,7 +497,9 @@ def test_parquet_read_filtered_multiple_files(tmpdir): df = pd.DataFrame({"x": range(10), "y": list("aaccccddee")}) df.to_parquet(fname_1, row_group_size=2) fname_2 = tmpdir.join("filtered_multiple_files_2.parquet") - df = pd.DataFrame({"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")}) + df = pd.DataFrame( + {"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")} + ) df.to_parquet(fname_2, row_group_size=2) # Check filter @@ -509,7 +531,9 @@ def test_parquet_read_filtered_multiple_files(tmpdir): ([[("x", "==", 0), ("z", "==", 9), ("y", "==", "a")]], 1), ], ) -def test_parquet_read_filtered_complex_predicate(tmpdir, predicate, expected_len): +def 
test_parquet_read_filtered_complex_predicate( + tmpdir, predicate, expected_len +): # Generate data fname = tmpdir.join("filtered_complex_predicate.parquet") df = pd.DataFrame( @@ -664,7 +688,9 @@ def test_parquet_reader_local_filepath(): cudf.read_parquet(fname) -@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]) +@pytest.mark.parametrize( + "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] +) def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src): expect = pd.read_parquet(parquet_path_or_buf("filepath")) got = cudf.read_parquet(parquet_path_or_buf(src)) @@ -694,11 +720,15 @@ def test_parquet_reader_use_python_file_object( # Pass open fsspec file with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet(fil, use_python_file_object=use_python_file_object) + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet(paths[0], use_python_file_object=use_python_file_object) + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) @@ -722,7 +752,9 @@ def create_parquet_source(df, src_type, fname): return pathlib.Path(fname).as_uri() -@pytest.mark.parametrize("src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]) +@pytest.mark.parametrize( + "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] +) def test_parquet_reader_multiple_files(tmpdir, src): test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2, dtype="float64") test_pdf2 = make_pdf(nrows=500, dtype="float64") @@ -852,7 +884,9 @@ def string_gen(first_val, i): return strings[int_gen(first_val, i) % len(strings)] -def list_row_gen(gen, first_val, list_size, lists_per_row, include_validity=False): +def list_row_gen( + gen, first_val, list_size, lists_per_row, include_validity=False +): """ Generate a single row for a List> column based on input parameters. 
@@ -918,7 +952,10 @@ def L(list_size, first_val): ] def R(first_val, lists_per_row, list_size): - return [L(list_size, first_val + (list_size * i)) for i in range(lists_per_row)] + return [ + L(list_size, first_val + (list_size * i)) + for i in range(lists_per_row) + ] return [ ( @@ -950,7 +987,9 @@ def test_parquet_reader_list_large(tmpdir): def test_parquet_reader_list_validity(tmpdir): - expect = pd.DataFrame({"a": list_gen(int_gen, 256, 80, 50, include_validity=True)}) + expect = pd.DataFrame( + {"a": list_gen(int_gen, 256, 80, 50, include_validity=True)} + ) fname = tmpdir.join("test_parquet_reader_list_validity.parquet") expect.to_parquet(fname) assert os.path.exists(fname) @@ -997,7 +1036,9 @@ def test_parquet_reader_list_large_multi_rowgroup(tmpdir): expect.reset_index(inplace=True) # round trip the dataframe to/from parquet - fname = tmpdir.join("test_parquet_reader_list_large_multi_rowgroup.parquet") + fname = tmpdir.join( + "test_parquet_reader_list_large_multi_rowgroup.parquet" + ) expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) got = cudf.read_parquet(fname) @@ -1014,7 +1055,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): ) # round trip the dataframe to/from parquet - fname = tmpdir.join("test_parquet_reader_list_large_multi_rowgroup_nulls.parquet") + fname = tmpdir.join( + "test_parquet_reader_list_large_multi_rowgroup_nulls.parquet" + ) expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) assert os.path.exists(fname) got = cudf.read_parquet(fname) @@ -1044,7 +1087,9 @@ def struct_gen(gen, skip_rows, num_rows, include_validity=False): def R(first_val, num_fields): return { - "col" + str(f): (gen[f](first_val, first_val) if f % 4 != 0 else None) + "col" + str(f): ( + gen[f](first_val, first_val) if f % 4 != 0 else None + ) if include_validity else (gen[f](first_val, first_val)) for f in range(len(gen)) @@ -1504,7 +1549,9 @@ def test_parquet_reader_nested_v2(tmpdir, data): @pytest.mark.filterwarnings("ignore:Using CPU") -def test_parquet_writer_cpu_pyarrow(tmpdir, pdf_day_timestamps, gdf_day_timestamps): +def test_parquet_writer_cpu_pyarrow( + tmpdir, pdf_day_timestamps, gdf_day_timestamps +): pdf_fname = tmpdir.join("pdf.parquet") gdf_fname = tmpdir.join("gdf.parquet") @@ -1763,7 +1810,9 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs): # Know the specific row-group count for row_group_size_rows if "row_group_size_rows" in row_group_size_kwargs: - assert nrow_groups == size // row_group_size_kwargs["row_group_size_rows"] + assert ( + nrow_groups == size // row_group_size_kwargs["row_group_size_rows"] + ) assert_eq(cudf.read_parquet(fname), gdf) @@ -1927,7 +1976,9 @@ def test_parquet_writer_chunked_max_file_size( gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1] * 10000, "b": range(0, 50000)}) - df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3] * 10000, "b": range(50000, 100000)}) + df2 = cudf.DataFrame( + {"a": [1, 3, 3, 1, 3] * 10000, "b": range(50000, 100000)} + ) cw = ParquetDatasetWriter( gdf_dir, @@ -1989,7 +2040,9 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) - with ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False) as cw: + with ParquetDatasetWriter( + gdf_dir, partition_cols=["a"], index=False + ) as cw: cw.write_table(df1) cw.write_table(df2) @@ -2057,7 +2110,9 @@ def 
test_parquet_write_to_dataset(tmpdir_factory, cols): ) @pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) @pytest.mark.parametrize("use_cat", [True, False]) -def test_read_parquet_partitioned_filtered(tmpdir, pfilters, selection, use_cat): +def test_read_parquet_partitioned_filtered( + tmpdir, pfilters, selection, use_cat +): rng = np.random.default_rng(2) path = str(tmpdir) size = 100 @@ -2177,8 +2232,12 @@ def test_write_cudf_read_pandas_pyarrow(tmpdir, pdf): assert_eq(pd_res, cudf_res, check_index_type=not pdf.empty) - cudf_res = pa.parquet.read_table(cudf_path, use_pandas_metadata=True).to_pandas() - pd_res = pa.parquet.read_table(pandas_path, use_pandas_metadata=True).to_pandas() + cudf_res = pa.parquet.read_table( + cudf_path, use_pandas_metadata=True + ).to_pandas() + pd_res = pa.parquet.read_table( + pandas_path, use_pandas_metadata=True + ).to_pandas() assert_eq(cudf_res, pd_res, check_index_type=not pdf.empty) @@ -2309,7 +2368,11 @@ def test_parquet_nullable_boolean(tmpdir, engine): pandas_path = tmpdir.join("pandas_bools.parquet") pdf = pd.DataFrame( - {"a": pd.Series([True, False, None, True, False], dtype=pd.BooleanDtype())} + { + "a": pd.Series( + [True, False, None, True, False], dtype=pd.BooleanDtype() + ) + } ) expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]}) @@ -2397,7 +2460,9 @@ def test_parquet_no_index_empty(): def test_parquet_allnull_str(tmpdir, engine): pandas_path = tmpdir.join("pandas_allnulls.parquet") - pdf = pd.DataFrame({"a": pd.Series([None, None, None, None, None], dtype="str")}) + pdf = pd.DataFrame( + {"a": pd.Series([None, None, None, None, None], dtype="str")} + ) expected_gdf = cudf.DataFrame( {"a": cudf.Series([None, None, None, None, None], dtype="str")} ) @@ -2772,7 +2837,9 @@ def test_parquet_reader_one_level_list3(datadir): @pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) @pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) -def test_to_parquet_row_group_size(tmpdir, large_int64_gdf, size_bytes, size_rows): +def test_to_parquet_row_group_size( + tmpdir, large_int64_gdf, size_bytes, size_rows +): fname = tmpdir.join("row_group_size.parquet") large_int64_gdf.to_parquet( fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index f0834429c1e..13a07ef8adc 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -53,7 +53,9 @@ def test_pickle_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]) + df["keys"] = pd.Categorical( + ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] + ) df["vals"] = np.random.random(len(df)) check_serialization(df) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 68424fd8e17..cf9e70d85c7 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION.
import datetime @@ -64,7 +64,9 @@ def test_query(data, fn, nulls): ] -@pytest.mark.parametrize("data,fn", product(params_query_data, params_query_env_fn)) +@pytest.mark.parametrize( + "data,fn", product(params_query_data, params_query_env_fn) +) def test_query_ref_env(data, fn): # prepare nelem, seed = data diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index f27b2dfeeb0..1a5f25e320f 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -15,7 +15,9 @@ def pdf(): return pd.DataFrame( { "col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]), - "col2": np.array([5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]), + "col2": np.array( + [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] + ), }, index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]), ) @@ -36,7 +38,9 @@ def test_rank_all_arguments( pdf = pdf.copy(deep=True) # for parallel pytest if numeric_only: - pdf["str"] = np.array(["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"]) + pdf["str"] = np.array( + ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"] + ) gdf = DataFrame.from_pandas(pdf) kwargs = { diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index c51987b48e8..c6ffa1d2bc7 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -83,7 +83,9 @@ def test_product(dtype, nelem): data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = np.random.uniform() * 2 + data[np.random.randint(low=0, high=nelem, size=1)] = ( + np.random.uniform() * 2 + ) else: data = gen_rand(dtype, nelem) @@ -367,7 +369,9 @@ def test_reductions_axis_none_warning(op): def test_reduction_column_multiindex(): - idx = cudf.MultiIndex.from_tuples([("a", 1), ("a", 2)], names=["foo", "bar"]) + idx = cudf.MultiIndex.from_tuples( + [("a", 1), ("a", 2)], names=["foo", "bar"] + ) df = cudf.DataFrame(np.array([[1, 3], [2, 4]]), columns=idx) result = df.mean() expected = df.to_pandas().mean() diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index de623f94728..8992c4d617b 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -362,7 +362,9 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): pdata = container(data) if np.dtype(data_dtype).kind not in ("f"): - data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[np.dtype(data_dtype)] + data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[ + np.dtype(data_dtype) + ] pdata = pdata.astype(data_dtype) # Explicitly using nans_as_nulls=True @@ -389,18 +391,18 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): cudf.Series(["-74.56", None, "-23.73", "34.55", "2.89", None]).astype( Decimal32Dtype(7, 2) ), - cudf.Series(["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan]).astype( - Decimal64Dtype(8, 3) - ), - cudf.Series(["2.964", None, "57.432", "-989.330", None, "56.444"]).astype( - Decimal64Dtype(8, 3) - ), - cudf.Series([np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan]).astype( - Decimal64Dtype(10, 4) - ), - cudf.Series(["2.964", None, "54347.432", "-989.330", None, "56.444"]).astype( - Decimal128Dtype(20, 7) - ), + cudf.Series( + ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan] + ).astype(Decimal64Dtype(8, 3)), + cudf.Series( + ["2.964", None, "57.432", "-989.330", None, 
"56.444"] + ).astype(Decimal64Dtype(8, 3)), + cudf.Series( + [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan] + ).astype(Decimal64Dtype(10, 4)), + cudf.Series( + ["2.964", None, "54347.432", "-989.330", None, "56.444"] + ).astype(Decimal128Dtype(20, 7)), ], ) @pytest.mark.parametrize( @@ -485,7 +487,10 @@ def test_fillna_categorical(psr_data, fill_value, inplace): else: fill_value_cudf = fill_value - if isinstance(fill_value_cudf, cudf.Series) and gsr.dtype != fill_value_cudf.dtype: + if ( + isinstance(fill_value_cudf, cudf.Series) + and gsr.dtype != fill_value_cudf.dtype + ): assert_exceptions_equal( lfunc=psr.fillna, rfunc=gsr.fillna, @@ -666,16 +671,26 @@ def test_fillna_datetime(psr_data, fill_value, inplace): dtype="datetime64[ns]", ), # Timedelta - np.array([10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]"), - np.array([None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]"), - np.array([10, 100, None, None, 1000, None, None], dtype="datetime64[ns]"), + np.array( + [10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]" + ), + np.array( + [None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]" + ), + np.array( + [10, 100, None, None, 1000, None, None], dtype="datetime64[ns]" + ), # String np.array( ["10", "100", "1000", None, None, "10", "100", "1000"], dtype="object", ), - np.array([None, None, "1000", None, "10", "100", "10"], dtype="object"), - np.array(["10", "100", None, None, "1000", None, None], dtype="object"), + np.array( + [None, None, "1000", None, "10", "100", "10"], dtype="object" + ), + np.array( + ["10", "100", None, None, "1000", None, None], dtype="object" + ), ], ) @pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) @@ -706,7 +721,9 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace): "df", [ pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}), - pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"]), + pd.DataFrame( + {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"] + ), pd.DataFrame({"a": [1, 2, 3]}), ], ) @@ -961,7 +978,9 @@ def test_series_multiple_times_with_nulls(): @pytest.mark.parametrize("series_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5]) +@pytest.mark.parametrize( + "replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5] +) def test_numeric_series_replace_dtype(series_dtype, replacement): psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) sr = cudf.from_pandas(psr) @@ -1000,13 +1019,15 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): sr.replace([0, 1], [replacement]) # Both lists of equal length - if (np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"}) or ( - not can_replace - ): + if ( + np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"} + ) or (not can_replace): with pytest.raises(TypeError): sr.replace([2, 3], [replacement, replacement]) else: - expect = psr.replace([2, 3], [replacement, replacement]).astype(psr.dtype) + expect = psr.replace([2, 3], [replacement, replacement]).astype( + psr.dtype + ) got = sr.replace([2, 3], [replacement, replacement]) assert_eq(expect, got) @@ -1112,7 +1133,9 @@ def test_replace_df_error(): ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_clip(lower, upper, inplace): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]}) + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} + ) 
gdf = cudf.from_pandas(pdf) got = gdf.clip(lower=lower, upper=upper, inplace=inplace) @@ -1149,7 +1172,9 @@ def test_dataframe_category_clip(lower, upper, inplace): [([2, 7.4], [4, 7.9, "d"]), ([2, 7.4, "a"], [4, 7.9, "d"])], ) def test_dataframe_exceptions_for_clip(lower, upper): - gdf = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]}) + gdf = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} + ) with pytest.raises(ValueError): gdf.clip(lower=lower, upper=upper) @@ -1358,7 +1383,9 @@ def test_replace_nulls(gsr, old, new, expected): def test_fillna_columns_multiindex(): columns = pd.MultiIndex.from_tuples([("a", "b"), ("d", "e")]) - pdf = pd.DataFrame({"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]}) + pdf = pd.DataFrame( + {"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]} + ) pdf.columns = columns gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index ea703a26a39..8f65bd26bd1 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -31,7 +31,9 @@ def test_null_series(nrows, dtype): if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view(mode="read").copy_to_host(), - dtype=np_dtypes_to_pandas_dtypes.get(cudf.dtype(dtype), cudf.dtype(dtype)), + dtype=np_dtypes_to_pandas_dtypes.get( + cudf.dtype(dtype), cudf.dtype(dtype) + ), ) ps[sr.isnull().to_pandas()] = pd.NA else: @@ -117,7 +119,11 @@ def test_integer_dataframe(x): pd.reset_option("display.max_columns") -@given(st.lists(st.integers(-9223372036854775808, 9223372036854775807), max_size=10000)) +@given( + st.lists( + st.integers(-9223372036854775808, 9223372036854775807), max_size=10000 + ) +) @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x, dtype=int) @@ -146,7 +152,9 @@ def test_float_series(x): def mixed_pdf(): pdf = pd.DataFrame() pdf["Integer"] = np.array([2345, 11987, 9027, 9027]) - pdf["Date"] = np.array(["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"]) + pdf["Date"] = np.array( + ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] + ) pdf["Float"] = np.array([9.001, 8.343, 6, 2.781]) pdf["Integer2"] = np.array([2345, 106, 2088, 789277]) pdf["Category"] = np.array(["M", "F", "F", "F"]) @@ -199,7 +207,9 @@ def test_MI(): @pytest.mark.parametrize("nrows", [0, 1, 3, 5, 10]) @pytest.mark.parametrize("ncols", [0, 1, 2, 3]) def test_groupby_MI(nrows, ncols): - gdf = cudf.DataFrame({"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}) + gdf = cudf.DataFrame( + {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} + ) pdf = gdf.to_pandas() gdg = gdf.groupby(["a", "b"], sort=True).count() pdg = pdf.groupby(["a", "b"], sort=True).count() @@ -322,8 +332,11 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ",\n , ],\n dtype='uint32')", ), ( - cudf.Index([None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16"), - "Index([, 111, 22, 33, , 23, 34, 2343, ], " "dtype='int16')", + cudf.Index( + [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" + ), + "Index([, 111, 22, 33, , 23, 34, 2343, ], " + "dtype='int16')", ), ( cudf.Index([1, 2, 3, None], dtype="category"), @@ -642,7 +655,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([None, None, None, None, None], dtype="timedelta64[ns]"), + cudf.Series( + [None, None, None, None, None], dtype="timedelta64[ns]" + ), textwrap.dedent( """ 0 NaT @@ -655,7 +670,9 @@ def 
test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([None, None, None, None, None], dtype="timedelta64[ms]"), + cudf.Series( + [None, None, None, None, None], dtype="timedelta64[ms]" + ), textwrap.dedent( """ 0 NaT @@ -668,7 +685,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]"), + cudf.Series( + [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" + ), textwrap.dedent( """ 0 0 days 00:00:00.000000012 @@ -682,7 +701,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]"), + cudf.Series( + [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" + ), textwrap.dedent( """ 0 0 days 00:00:00.012 @@ -852,7 +873,11 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): [ ( cudf.DataFrame( - {"a": cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[s]")} + { + "a": cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[s]" + ) + } ), textwrap.dedent( """ @@ -1009,8 +1034,11 @@ def test_timedelta_dataframe_repr(df, expected_repr): "dtype='timedelta64[ms]')", ), ( - cudf.Index([None, None, None, None, None], dtype="timedelta64[us]"), - "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " "dtype='timedelta64[us]')", + cudf.Index( + [None, None, None, None, None], dtype="timedelta64[us]" + ), + "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " + "dtype='timedelta64[us]')", ), ( cudf.Index( @@ -1060,7 +1088,9 @@ def test_timedelta_index_repr(index, expected_repr): @pytest.mark.parametrize( "pmi", [ - pd.MultiIndex.from_tuples([(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]), + pd.MultiIndex.from_tuples( + [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] + ), pd.MultiIndex.from_tuples( [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] * 10 ), @@ -1212,7 +1242,9 @@ def test_multiindex_repr(pmi, max_seq_items): cudf.DataFrame( { "a": [None, None, None, None], - "b": cudf.Series([None, None, None, None], dtype="timedelta64[ns]"), + "b": cudf.Series( + [None, None, None, None], dtype="timedelta64[ns]" + ), "c": [0.345, np.nan, 100, 10], } ) @@ -1327,9 +1359,9 @@ def test_multiindex_null_repr(gdi, expected_repr): def test_categorical_series_with_nan_repr(): - series = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( - "category" - ) + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") expected_repr = textwrap.dedent( """ @@ -1361,9 +1393,9 @@ def test_categorical_series_with_nan_repr(): def test_categorical_dataframe_with_nan_repr(): - series = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( - "category" - ) + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") df = cudf.DataFrame({"a": series}) expected_repr = textwrap.dedent( """ @@ -1382,9 +1414,9 @@ def test_categorical_dataframe_with_nan_repr(): def test_categorical_index_with_nan_repr(): cat_index = cudf.Index( - cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False).astype( - "category" - ) + cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") ) expected_repr = ( diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index 7564d865142..ad6e0ac52c5 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -52,7 +52,9 @@ def test_series_resample_ffill(rule): rng = 
pd.date_range("1/1/2012", periods=10, freq="5s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) gts = cudf.from_pandas(ts) - assert_resample_results_equal(ts.resample(rule).ffill(), gts.resample(rule).ffill()) + assert_resample_results_equal( + ts.resample(rule).ffill(), gts.resample(rule).ffill() + ) @pytest.mark.parametrize("rule", ["2s", "10s"]) @@ -60,7 +62,9 @@ def test_series_resample_bfill(rule): rng = pd.date_range("1/1/2012", periods=10, freq="5s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) gts = cudf.from_pandas(ts) - assert_resample_results_equal(ts.resample(rule).bfill(), gts.resample(rule).bfill()) + assert_resample_results_equal( + ts.resample(rule).bfill(), gts.resample(rule).bfill() + ) @pytest.mark.parametrize("rule", ["2s", "10s"]) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index a1da690d0ad..d618669755d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -46,7 +46,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): colname = "id" + str(i) data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -58,7 +60,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): colname = "val" + str(i) data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -73,7 +77,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars) # pandas' melt makes the 'variable' column of 'object' type (string) # cuDF's melt makes it Categorical because it doesn't support strings - expect["variable"] = expect["variable"].astype(got["variable"].dtype.to_pandas()) + expect["variable"] = expect["variable"].astype( + got["variable"].dtype.to_pandas() + ) assert_eq(expect, got) @@ -89,7 +95,9 @@ def test_melt_many_columns(): grid_df = pd.melt(df, id_vars=["id"], var_name="d", value_name="sales") df_d = cudf.DataFrame(mydict) - grid_df_d = cudf.melt(df_d, id_vars=["id"], var_name="d", value_name="sales") + grid_df_d = cudf.melt( + df_d, id_vars=["id"], var_name="d", value_name="sales" + ) grid_df_d["d"] = grid_df_d["d"].astype("str") assert_eq(grid_df, grid_df_d) @@ -97,7 +105,9 @@ def test_melt_many_columns(): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) -@pytest.mark.parametrize("dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"]))) +@pytest.mark.parametrize( + "dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"])) +) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_df_stack(nulls, num_cols, num_rows, dtype): if dtype not in ["float32", "float64"] and nulls in ["some"]: @@ -108,7 +118,9 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): colname = str(i) data = np.random.randint(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan pdf[colname] = data @@ 
-190,9 +202,13 @@ def test_df_stack_reset_index(): @pytest.mark.parametrize("dropna", [True, False]) def test_df_stack_multiindex_column_axis(columns, index, level, dropna): if isinstance(level, list) and len(level) > 1 and not dropna: - pytest.skip("Stacking multiple levels with dropna==False is unsupported.") + pytest.skip( + "Stacking multiple levels with dropna==False is unsupported." + ) - pdf = pd.DataFrame(data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index) + pdf = pd.DataFrame( + data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index + ) gdf = cudf.from_pandas(pdf) with pytest.warns(FutureWarning): @@ -258,7 +274,9 @@ def test_df_stack_multiindex_column_axis_pd_example(level): @pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) @pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["category"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["category"] +) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_interleave_columns(nulls, num_cols, num_rows, dtype): if dtype not in ["float32", "float64"] and nulls in ["some"]: @@ -270,7 +288,9 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan pdf[colname] = data @@ -282,7 +302,9 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): else: got = gdf.interleave_columns() - expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype(dtype) + expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype( + dtype + ) assert_eq(expect, got) @@ -299,10 +321,14 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): pdf = pd.DataFrame(dtype=dtype) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype(dtype) + data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( + dtype + ) if nulls == "some": - idx = np.random.choice(num_rows, size=int(num_rows / 2), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) data[idx] = np.nan pdf[colname] = data @@ -338,7 +364,9 @@ def _prepare_merge_sorted_test( indices = [i * chunk for i in range(0, nparts)] + [size] if index: dfs = [ - df.iloc[indices[i] : indices[i + 1]].copy().sort_index(ascending=ascending) + df.iloc[indices[i] : indices[i + 1]] + .copy() + .sort_index(ascending=ascending) for i in range(nparts) ] elif series: @@ -378,7 +406,9 @@ def test_df_merge_sorted(nparts, keys, na_position, ascending): ascending=ascending, ) - expect = df.sort_values(keys_1, na_position=na_position, ascending=ascending) + expect = df.sort_values( + keys_1, na_position=na_position, ascending=ascending + ) result = cudf.core.reshape._merge_sorted( dfs, keys=keys, na_position=na_position, ascending=ascending ) @@ -400,7 +430,9 @@ def test_df_merge_sorted_index(nparts, index, ascending): ) expect = df.sort_index(ascending=ascending) - result = cudf.core.reshape._merge_sorted(dfs, by_index=True, ascending=ascending) + result = cudf.core.reshape._merge_sorted( + dfs, by_index=True, ascending=ascending + ) assert_eq(expect.index, result.index) @@ -424,7 +456,9 @@ def test_df_merge_sorted_ignore_index(keys, na_position, ascending): ascending=ascending, ) - expect = df.sort_values(keys_1, 
na_position=na_position, ascending=ascending) + expect = df.sort_values( + keys_1, na_position=na_position, ascending=ascending + ) result = cudf.core.reshape._merge_sorted( dfs, keys=keys, @@ -514,7 +548,9 @@ def test_pivot_multi_values(): ) -@pytest.mark.parametrize("values", ["z", "z123", ["z123"], ["z", "z123", "123z"]]) +@pytest.mark.parametrize( + "values", ["z", "z123", ["z123"], ["z", "z123", "123z"]] +) def test_pivot_values(values): data = [ ["A", "a", 0, 0, 0], @@ -545,24 +581,32 @@ def test_pivot_values(values): 0, pytest.param( 1, - marks=pytest_xfail(reason="Categorical column indexes not supported"), + marks=pytest_xfail( + reason="Categorical column indexes not supported" + ), ), 2, "foo", pytest.param( "bar", - marks=pytest_xfail(reason="Categorical column indexes not supported"), + marks=pytest_xfail( + reason="Categorical column indexes not supported" + ), ), "baz", [], pytest.param( [0, 1], - marks=pytest_xfail(reason="Categorical column indexes not supported"), + marks=pytest_xfail( + reason="Categorical column indexes not supported" + ), ), ["foo"], pytest.param( ["foo", "bar"], - marks=pytest_xfail(reason="Categorical column indexes not supported"), + marks=pytest_xfail( + reason="Categorical column indexes not supported" + ), ), pytest.param( [0, 1, 2], @@ -602,7 +646,9 @@ def test_unstack_multiindex(level): pd.Index(range(0, 5), name="row_index"), pytest.param( pd.CategoricalIndex(["d", "e", "f", "g", "h"]), - marks=pytest_xfail(reason="Categorical column indexes not supported"), + marks=pytest_xfail( + reason="Categorical column indexes not supported" + ), ), ], ) @@ -643,7 +689,9 @@ def test_unstack_index_invalid(): def test_pivot_duplicate_error(): - gdf = cudf.DataFrame({"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]}) + gdf = cudf.DataFrame( + {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} + ) with pytest.raises(ValueError): gdf.pivot(index="a", columns="b") with pytest.raises(ValueError): @@ -662,7 +710,9 @@ def test_pivot_duplicate_error(): } ], ) -@pytest.mark.parametrize("aggfunc", ["mean", "count", {"D": "sum", "E": "count"}]) +@pytest.mark.parametrize( + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) @pytest.mark.parametrize("fill_value", [0]) def test_pivot_table_simple(data, aggfunc, fill_value): pdf = pd.DataFrame(data) @@ -698,7 +748,9 @@ def test_pivot_table_simple(data, aggfunc, fill_value): } ], ) -@pytest.mark.parametrize("aggfunc", ["mean", "count", {"D": "sum", "E": "count"}]) +@pytest.mark.parametrize( + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) @pytest.mark.parametrize("fill_value", [0]) def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): pdf = pd.DataFrame(data) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 1280aed6c42..1d1d7ae8d29 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -20,7 +20,9 @@ ([1, 2, 4, 9, 9, 4], ["a", "b", "c", "d", "e", "f"]), ], ) -@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "std", "var"]) +@pytest.mark.parametrize( + "agg", ["sum", "min", "max", "mean", "count", "std", "var"] +) @pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) @pytest.mark.parametrize("center", [True, False]) def test_rolling_series_basic(data, index, agg, nulls, center): @@ -44,9 +46,9 @@ def test_rolling_series_basic(data, index, agg, nulls, center): expect = getattr( psr.rolling(window_size, min_periods, center), agg )().fillna(-1) - got = 
getattr(gsr.rolling(window_size, min_periods, center), agg)().fillna( - -1 - ) + got = getattr( + gsr.rolling(window_size, min_periods, center), agg + )().fillna(-1) assert_eq(expect, got, check_dtype=False, check_freq=False) @@ -62,7 +64,9 @@ def test_rolling_series_basic(data, index, agg, nulls, center): }, ], ) -@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "std", "var"]) +@pytest.mark.parametrize( + "agg", ["sum", "min", "max", "mean", "count", "std", "var"] +) @pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) @pytest.mark.parametrize("center", [True, False]) def test_rolling_dataframe_basic(data, agg, nulls, center): @@ -88,9 +92,9 @@ def test_rolling_dataframe_basic(data, agg, nulls, center): expect = getattr( pdf.rolling(window_size, min_periods, center), agg )().fillna(-1) - got = getattr(gdf.rolling(window_size, min_periods, center), agg)().fillna( - -1 - ) + got = getattr( + gdf.rolling(window_size, min_periods, center), agg + )().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -274,7 +278,9 @@ def test_rolling_getitem(): def test_rolling_getitem_window(): - index = pd.DatetimeIndex(pd.date_range("2000-01-01", "2000-01-02", freq="1h")) + index = pd.DatetimeIndex( + pd.date_range("2000-01-01", "2000-01-02", freq="1h") + ) pdf = pd.DataFrame({"x": np.arange(len(index))}, index=index) gdf = cudf.from_pandas(pdf) @@ -375,7 +381,9 @@ def some_func(A): ) -@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "var", "std"]) +@pytest.mark.parametrize( + "agg", ["sum", "min", "max", "mean", "count", "var", "std"] +) def test_rolling_groupby_simple(agg): pdf = pd.DataFrame( { @@ -386,7 +394,9 @@ def test_rolling_groupby_simple(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna(-1) + expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( + -1 + ) got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -396,12 +406,16 @@ def test_rolling_groupby_simple(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna(-1) + expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( + -1 + ) got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "var", "std"]) +@pytest.mark.parametrize( + "agg", ["sum", "min", "max", "mean", "count", "var", "std"] +) def test_rolling_groupby_multi(agg): pdf = pd.DataFrame( { @@ -422,8 +436,12 @@ def test_rolling_groupby_multi(agg): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count", "var", "std"]) -@pytest.mark.parametrize("window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"]) +@pytest.mark.parametrize( + "agg", ["sum", "min", "max", "mean", "count", "var", "std"] +) +@pytest.mark.parametrize( + "window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"] +) def test_rolling_groupby_offset(agg, window_size): pdf = pd.DataFrame( { @@ -433,7 +451,9 @@ def test_rolling_groupby_offset(agg, window_size): } ).set_index("date") gdf = cudf.from_pandas(pdf) - expect = getattr(pdf.groupby("group").rolling(window_size), agg)().fillna(-1) + expect = getattr(pdf.groupby("group").rolling(window_size), agg)().fillna( + -1 + ) got = 
getattr(gdf.groupby("group").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -442,7 +462,9 @@ def test_rolling_custom_index_support(): from pandas.api.indexers import BaseIndexer class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed, step=None): + def get_window_bounds( + self, num_values, min_periods, center, closed, step=None + ): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index f36afc950e8..cdce17eeb76 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -124,7 +124,9 @@ def pdf_ext(scope="module"): df["Integer"] = np.array([i for i in range(size)]) df["List"] = [[i] for i in range(size)] df["Struct"] = [{"a": i} for i in range(size)] - df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[:size] + df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ + :size + ] return df @@ -186,7 +188,9 @@ def test_read_csv_byte_range( f"s3://{bucket}/{fname}", storage_options=s3so, byte_range=(74, 73), - bytes_per_thread=bytes_per_thread if not use_python_file_object else None, + bytes_per_thread=bytes_per_thread + if not use_python_file_object + else None, header=None, names=["Integer", "Float", "Integer2", "String", "Boolean"], use_python_file_object=use_python_file_object, @@ -255,7 +259,9 @@ def test_read_parquet( # Check fsspec file-object handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = get_fs_token_paths(f"s3://{bucket}/{fname}", storage_options=s3so)[0] + fs = get_fs_token_paths( + f"s3://{bucket}/{fname}", storage_options=s3so + )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: got2 = cudf.read_parquet( f, @@ -297,7 +303,9 @@ def test_read_parquet_ext( ) if index: expect = ( - pdf_ext.set_index(index)[columns] if columns else pdf_ext.set_index(index) + pdf_ext.set_index(index)[columns] + if columns + else pdf_ext.set_index(index) ) else: expect = pdf_ext[columns] if columns else pdf_ext @@ -396,8 +404,12 @@ def test_write_parquet(s3_base, s3so, pdf, partition_cols): ) assert s3fs.exists(f"s3://{bucket}/{fname_pandas}") - got = pd.read_parquet(f"s3://{bucket}/{fname_pandas}", storage_options=s3so) - expect = cudf.read_parquet(f"s3://{bucket}/{fname_cudf}", storage_options=s3so) + got = pd.read_parquet( + f"s3://{bucket}/{fname_pandas}", storage_options=s3so + ) + expect = cudf.read_parquet( + f"s3://{bucket}/{fname_cudf}", storage_options=s3so + ) assert_eq(expect, got) @@ -492,7 +504,9 @@ def test_write_chunked_parquet(s3_base, s3so): bucket = "parquet" from cudf.io.parquet import ParquetDatasetWriter - with s3_context(s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()}) as s3fs: + with s3_context( + s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()} + ) as s3fs: with ParquetDatasetWriter( f"s3://{bucket}/{dirname}", partition_cols=["a"], diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 9ac0404a9bc..05a91a8fea3 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -203,13 +203,17 @@ def test_scalar_roundtrip(value): @pytest.mark.parametrize( "dtype", - NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] + TEST_DECIMAL_TYPES, + NUMERIC_TYPES + + DATETIME_TYPES + + TIMEDELTA_TYPES + + ["object"] + + TEST_DECIMAL_TYPES, ) def 
test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype(dtype) or cudf.api.types.is_timedelta64_dtype( + if cudf.api.types.is_datetime64_dtype( dtype - ): + ) or cudf.api.types.is_timedelta64_dtype(dtype): assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -241,7 +245,9 @@ def test_nat_to_null_scalar_succeeds(value): assert s.dtype == value.dtype -@pytest.mark.parametrize("value", [None, np.datetime64("NaT"), np.timedelta64("NaT")]) +@pytest.mark.parametrize( + "value", [None, np.datetime64("NaT"), np.timedelta64("NaT")] +) def test_generic_null_scalar_construction_fails(value): with pytest.raises(TypeError): cudf.Scalar(value) @@ -389,7 +395,9 @@ def test_device_scalar_direct_construction(value, decimal_type): @pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_construct_from_scalar(value): value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) - x = cudf.Scalar(value, value.dtype if not isinstance(value, Decimal) else None) + x = cudf.Scalar( + value, value.dtype if not isinstance(value, Decimal) else None + ) y = cudf.Scalar(x) assert x.value == y.value or np.isnan(x.value) and np.isnan(y.value) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index 1b4ebec781a..3ba652ff6c0 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -105,7 +105,9 @@ def test_searchsorted_categorical(side): @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_datetime(side): - psr1 = pd.Series(pd.date_range("20190101", "20200101", freq="400h", name="times")) + psr1 = pd.Series( + pd.date_range("20190101", "20200101", freq="400h", name="times") + ) sr1 = cudf.from_pandas(psr1) psr2 = pd.Series( diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 419d4dda7a9..f26d78e7783 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -83,13 +83,17 @@ lambda: cudf.DataFrame( {"a": list(range(13)), "b": [float(x) for x in range(13)]}, index=cudf.Index( - cudf.date_range(start="2011-01-01", end="2012-01-01", periods=13) + cudf.date_range( + start="2011-01-01", end="2012-01-01", periods=13 + ) ), ), lambda: cudf.Series( list(range(13)), index=cudf.Index( - cudf.date_range(start="2011-01-01", end="2012-01-01", periods=13) + cudf.date_range( + start="2011-01-01", end="2012-01-01", periods=13 + ) ), ), lambda: cudf.TimedeltaIndex( @@ -159,7 +163,9 @@ def test_serialize_dataframe(): df = cudf.DataFrame() df["a"] = np.arange(100) df["b"] = np.arange(100, dtype=np.float32) - df["c"] = pd.Categorical(["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"]) + df["c"] = pd.Categorical( + ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + ) outdf = cudf.DataFrame.deserialize(*df.serialize()) assert_eq(df, outdf) @@ -168,7 +174,9 @@ def test_serialize_dataframe_with_index(): df = cudf.DataFrame() df["a"] = np.arange(100) df["b"] = np.random.random(100) - df["c"] = pd.Categorical(["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"]) + df["c"] = pd.Categorical( + ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + ) df = df.sort_values("b") outdf = cudf.DataFrame.deserialize(*df.serialize()) assert_eq(df, outdf) @@ -203,7 +211,9 @@ def test_serialize_multi_index(): gdf = cudf.DataFrame.from_pandas(pdf) gdg = gdf.groupby(["a", "b"]).sum() multiindex = gdg.index - outindex = 
cudf.core.multiindex.MultiIndex.deserialize(*multiindex.serialize()) + outindex = cudf.core.multiindex.MultiIndex.deserialize( + *multiindex.serialize() + ) assert_eq(multiindex, outindex) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 5b2308465c4..48194494260 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -40,7 +40,9 @@ def _series_na_data(): pd.Series(["a", "b", "u", "h", "d"]), pd.Series([None, None, np.nan, None, np.inf, -np.inf]), pd.Series([], dtype="float64"), - pd.Series([pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")]), + pd.Series( + [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] + ), pd.Series([np.nan]), pd.Series([None]), pd.Series(["a", "b", "", "c", None, "e"]), @@ -258,7 +260,9 @@ def test_series_concat_error_mixed_types(): ] * 25, [ - pd.Series([0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"]), + pd.Series( + [0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"] + ), pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), ] * 46, @@ -305,9 +309,13 @@ def test_series_concat_existing_buffers(): a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) gs = cudf.concat([a5, a6]) - np.testing.assert_equal(gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()])) + np.testing.assert_equal( + gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) + ) gs = cudf.concat([cudf.Series(a6), a5]) - np.testing.assert_equal(gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()])) + np.testing.assert_equal( + gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) + ) def test_series_column_iter_error(): @@ -356,7 +364,9 @@ def test_series_column_iter_error(): [None, None, None, None, None], np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), np.array(["1991-11-20", None], dtype=np.datetime64), - np.array(["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64), + np.array( + ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 + ), np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), ], ) @@ -434,7 +444,9 @@ def test_series_describe_timedelta(dtype): pd.Series(["d", "e", "f"], dtype="category"), pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])), pd.Series( - pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"], ordered=True) + pd.Categorical( + ["d", "e", "f"], categories=["f", "e", "d"], ordered=True + ) ), ], ) @@ -545,12 +557,16 @@ def test_categorical_value_counts(dropna, normalize, num_elements): # gdf gdf = cudf.DataFrame() gdf["a"] = cudf.Series.from_categorical(pd_cat) - gdf_value_counts = gdf["a"].value_counts(dropna=dropna, normalize=normalize) + gdf_value_counts = gdf["a"].value_counts( + dropna=dropna, normalize=normalize + ) # pandas pdf = pd.DataFrame() pdf["a"] = pd_cat - pdf_value_counts = pdf["a"].value_counts(dropna=dropna, normalize=normalize) + pdf_value_counts = pdf["a"].value_counts( + dropna=dropna, normalize=normalize + ) # verify assert_eq( @@ -573,11 +589,15 @@ def test_series_value_counts(dropna, normalize): for size in [10**x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask)._column.as_mask()) + sr = cudf.Series.from_masked_array( + arr, cudf.Series(mask)._column.as_mask() + ) sr.name = "col" expect = ( - sr.to_pandas().value_counts(dropna=dropna, normalize=normalize).sort_index() + 
sr.to_pandas() + .value_counts(dropna=dropna, normalize=normalize) + .sort_index() ) got = sr.value_counts(dropna=dropna, normalize=normalize).sort_index() @@ -614,8 +634,12 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, None]) gsr = cudf.from_pandas(psr) - expected = psr.value_counts(ascending=ascending, dropna=dropna, normalize=normalize) - got = gsr.value_counts(ascending=ascending, dropna=dropna, normalize=normalize) + expected = psr.value_counts( + ascending=ascending, dropna=dropna, normalize=normalize + ) + got = gsr.value_counts( + ascending=ascending, dropna=dropna, normalize=normalize + ) assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) assert_eq( @@ -632,7 +656,9 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize): cudf.Series([None]), cudf.Series([4]), cudf.Series([2, 3, -1, 0, 1], name="test name"), - cudf.Series([1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"]), + cudf.Series( + [1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"] + ), cudf.Series([1, 2, 3, None, 2, 1, None], name="abc"), cudf.Series(["ab", "bc", "ab", None, "bc", None, None]), cudf.Series([None, None, None, None, None], dtype="str"), @@ -834,7 +860,9 @@ def test_series_memory_usage(): ), ( cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"), - pd.Series([234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()), + pd.Series( + [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() + ), ), ( cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"), @@ -846,10 +874,14 @@ def test_series_memory_usage(): ), ( cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"), - pd.Series([11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()), + pd.Series( + [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype() + ), ), ( - cudf.Series([32431, None, None, 32322, 0, 10, -32324, None], dtype="int64"), + cudf.Series( + [32431, None, None, 32322, 0, 10, -32324, None], dtype="int64" + ), pd.Series( [32431, None, None, 32322, 0, 10, -32324, None], dtype=pd.Int64Dtype(), @@ -1220,7 +1252,9 @@ def test_series_drop_raises(): [ None, ["ia", "ib", "ic", "id", "ie"], - pd.MultiIndex.from_tuples([(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]), + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), ], ) def test_explode(data, ignore_index, p_index): @@ -1292,7 +1326,9 @@ def test_series_raises_float16(data): @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_series_sort_index(index, axis, ascending, inplace, ignore_index, na_position): +def test_series_sort_index( + index, axis, ascending, inplace, ignore_index, na_position +): ps = pd.Series([10, 3, 12], index=index) gs = cudf.from_pandas(ps) @@ -1376,7 +1412,9 @@ def test_equals_names(lhs, rhs): assert_eq(expect, got) -@pytest.mark.parametrize("data", [[True, False, None, True, False], [None, None], []]) +@pytest.mark.parametrize( + "data", [[True, False, None, True, False], [None, None], []] +) @pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) def test_nullable_bool_dtype_series(data, bool_dtype): psr = pd.Series(data, dtype=pd.BooleanDtype()) @@ -1399,7 +1437,8 @@ def test_reset_index(level, drop, inplace, original_name, name): if not drop and inplace: pytest.skip( - "For exception checks, see " 
"test_reset_index_dup_level_name_exceptions" + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" ) expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) @@ -1424,7 +1463,8 @@ def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): gs = cudf.from_pandas(ps) if level == [None] or not drop and inplace: pytest.skip( - "For exception checks, see " "test_reset_index_dup_level_name_exceptions" + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" ) expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) @@ -1449,7 +1489,8 @@ def test_reset_index_named(drop, inplace, original_name, name): if not drop and inplace: pytest.skip( - "For exception checks, see " "test_reset_index_dup_level_name_exceptions" + "For exception checks, see " + "test_reset_index_dup_level_name_exceptions" ) expect = ps.reset_index(drop=drop, inplace=inplace, name=name) @@ -1649,13 +1690,19 @@ def test_series_truncate_errors(): def test_series_truncate_datetimeindex(): - dates = cudf.date_range("2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s") + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" + ) csr = cudf.Series(range(len(dates)), index=dates) psr = csr.to_pandas() assert_eq( - csr.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27"), - psr.truncate(before="2021-01-01 23:45:18", after="2021-01-01 23:45:27"), + csr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + psr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), ) @@ -1890,14 +1937,18 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): indices = s.digitize(s_bins, right) else: indices = s.digitize(bins, right) - np.testing.assert_array_equal(np.digitize(data, bins, right), indices.to_numpy()) + np.testing.assert_array_equal( + np.digitize(data, bins, right), indices.to_numpy() + ) def test_series_digitize_invalid_bins(): s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - with pytest.raises(ValueError, match="`bins` cannot contain null entries."): + with pytest.raises( + ValueError, match="`bins` cannot contain null entries." 
+ ): _ = s.digitize(bins) @@ -2086,7 +2137,9 @@ def test_series_copy(data, copy): {"a": 1}, ], ) -@pytest.mark.parametrize("index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]) +@pytest.mark.parametrize( + "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] +) def test_series_init_dict_with_index(data, index): pandas_series = pd.Series(data, index=index) cudf_series = cudf.Series(data, index=index) @@ -2095,7 +2148,9 @@ def test_series_init_dict_with_index(data, index): @pytest.mark.parametrize("data", ["abc", None, 1, 3.7]) -@pytest.mark.parametrize("index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]) +@pytest.mark.parametrize( + "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] +) def test_series_init_scalar_with_index(data, index): pandas_series = pd.Series(data, index=index) cudf_series = cudf.Series(data, index=index) @@ -2267,7 +2322,9 @@ def test_series_count_invalid_param(): s.count(skipna=True) -@pytest.mark.parametrize("data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]]) +@pytest.mark.parametrize( + "data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]] +) def test_series_setitem_nat_with_non_datetimes(data): s = cudf.Series(data) with pytest.raises(TypeError): @@ -2350,7 +2407,9 @@ def test_series_arrow_numeric_types_roundtrip(pandas_type): cudf.from_pandas(pdf) -@pytest.mark.parametrize("pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()]) +@pytest.mark.parametrize( + "pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()] +) def test_series_arrow_bool_types_roundtrip(pandas_type): ps = pd.Series([True, False, None], dtype=pandas_type) pi = pd.Index(ps) @@ -2369,7 +2428,9 @@ def test_series_arrow_bool_types_roundtrip(pandas_type): cudf.from_pandas(pdf) -@pytest.mark.parametrize("pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()]) +@pytest.mark.parametrize( + "pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()] +) def test_series_arrow_string_types_roundtrip(pandas_type): ps = pd.Series(["abc", None, "xyz"], dtype=pandas_type) pi = pd.Index(ps) @@ -2622,7 +2683,9 @@ def test_list_interval_like_maintains_dtype(): assert_eq(result, expected) -@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index, pd.Series, pd.Index]) +@pytest.mark.parametrize( + "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] +) def test_series_from_named_object_name_priority(klass): result = cudf.Series(klass([1], name="a"), name="b") assert result.name == "b" diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index c8b5f8ca5c0..9da08e483c9 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from itertools import product from math import floor @@ -54,7 +54,9 @@ def test_series_map_callable_numeric_random(nelem): # Call map got = sr.map(lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))) - expect = pdsr.map(lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))) + expect = pdsr.map( + lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)) + ) # Check assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 61e5b8f4ba3..ff2f7bd41f2 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -38,7 +38,9 @@ def test_dataframe_setitem_scaler_bool(): [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], ) @pytest.mark.parametrize("arg", [["a"], "a", "b"]) -@pytest.mark.parametrize("value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"]) +@pytest.mark.parametrize( + "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] +) def test_dataframe_setitem_columns(df, arg, value): gdf = cudf.from_pandas(df) cudf_replace_value = value @@ -81,16 +83,22 @@ def test_dataframe_setitem_new_columns(df, arg, value): # set_item_series inconsistency def test_series_setitem_index(): - df = pd.DataFrame(data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3]) + df = pd.DataFrame( + data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] + ) df["b"] = pd.Series(data=[12, 11, 10], index=[3, 2, 1]) - gdf = cudf.DataFrame(data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3]) + gdf = cudf.DataFrame( + data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] + ) gdf["b"] = cudf.Series(data=[12, 11, 10], index=[3, 2, 1]) assert_eq(df, gdf, check_dtype=False) @pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])]) -@pytest.mark.parametrize("arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]]) +@pytest.mark.parametrize( + "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] +) def test_series_set_item(psr, arg): gsr = cudf.from_pandas(psr) @@ -114,7 +122,9 @@ def test_series_setitem_singleton_range(): @pytest.mark.parametrize( "index", [ - pd.MultiIndex.from_frame(pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]})), + pd.MultiIndex.from_frame( + pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]}) + ), ["a", "b", "c"], ], ) @@ -380,7 +390,9 @@ def test_loc_setitem_string_11298(value): @pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944") def test_loc_setitem_list_11944(): - df = pd.DataFrame(data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]}) + df = pd.DataFrame( + data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]} + ) cdf = cudf.from_pandas(df) df.loc[df.a == "yes", "b"] = [["hello"]] cdf.loc[df.a == "yes", "b"] = [["hello"]] @@ -445,10 +457,16 @@ def test_loc_setitem_series_index_alignment_13031(other_index): pd.Series([1, 2, 3], index=pd.RangeIndex(0, 3)), pd.Series([1, 2, 3], index=pd.RangeIndex(start=2, stop=-1, step=-1)), pd.Series([1, 2, 3], index=pd.RangeIndex(start=1, stop=6, step=2)), - pd.Series([1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2)), - pd.Series([1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3)), + pd.Series( + [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2) + ), + pd.Series( + [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3) + ), pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=14, step=4)), - pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=-14, 
step=-4)), + pd.Series( + [1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=-14, step=-4) + ), ], ) @pytest.mark.parametrize("arg", list(range(-20, 20)) + [5.6, 3.1]) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index bb43be9bbdf..618c4f30bd9 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -30,7 +30,9 @@ sort_slice_args = [slice(1, None), slice(None, -1), slice(1, -1)] -@pytest.mark.parametrize("nelem,dtype", list(product(sort_nelem_args, sort_dtype_args))) +@pytest.mark.parametrize( + "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) +) def test_dataframe_sort_values(nelem, dtype): np.random.seed(0) df = DataFrame() @@ -56,7 +58,9 @@ def test_dataframe_sort_values_ignore_index(index, ignore_index): reason="Unstable sorting by pandas(numpy): https://github.com/pandas-dev/pandas/issues/57531" ) - gdf = DataFrame({"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]}) + gdf = DataFrame( + {"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]} + ) gdf = gdf.set_index(index) pdf = gdf.to_pandas() @@ -77,7 +81,9 @@ def test_series_sort_values_ignore_index(ignore_index): assert_eq(expect, got) -@pytest.mark.parametrize("nelem,sliceobj", list(product([10, 100], sort_slice_args))) +@pytest.mark.parametrize( + "nelem,sliceobj", list(product([10, 100], sort_slice_args)) +) def test_dataframe_sort_values_sliced(nelem, sliceobj): np.random.seed(0) df = pd.DataFrame() @@ -105,7 +111,9 @@ def test_series_argsort(nelem, dtype, asc): np.testing.assert_array_equal(expected, res.to_numpy()) -@pytest.mark.parametrize("nelem,asc", list(product(sort_nelem_args, [True, False]))) +@pytest.mark.parametrize( + "nelem,asc", list(product(sort_nelem_args, [True, False])) +) def test_series_sort_index(nelem, asc): np.random.seed(0) sr = Series(100 * np.random.random(nelem)) @@ -204,7 +212,9 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column(num_cols, num_rows, dtype, ascending, na_position): +def test_dataframe_multi_column( + num_cols, num_rows, dtype, ascending, na_position +): np.random.seed(0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() @@ -219,7 +229,9 @@ def test_dataframe_multi_column(num_cols, num_rows, dtype, ascending, na_positio got = gdf.sort_values(by, ascending=ascending, na_position=na_position) expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - assert_eq(got[by].reset_index(drop=True), expect[by].reset_index(drop=True)) + assert_eq( + got[by].reset_index(drop=True), expect[by].reset_index(drop=True) + ) @pytest.mark.parametrize("num_cols", [1, 2, 3]) @@ -241,7 +253,9 @@ def test_dataframe_multi_column_nulls( if nulls == "some": idx = np.array([], dtype="int64") if num_rows > 0: - idx = np.random.choice(num_rows, size=int(num_rows / 4), replace=False) + idx = np.random.choice( + num_rows, size=int(num_rows / 4), replace=False + ) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -252,13 +266,21 @@ def test_dataframe_multi_column_nulls( got = gdf.sort_values(by, ascending=ascending, na_position=na_position) expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - assert_eq(got[by].reset_index(drop=True), expect[by].reset_index(drop=True)) + assert_eq( + 
got[by].reset_index(drop=True), expect[by].reset_index(drop=True) + ) -@pytest.mark.parametrize("ascending", list(product((True, False), (True, False)))) +@pytest.mark.parametrize( + "ascending", list(product((True, False), (True, False))) +) @pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column_nulls_multiple_ascending(ascending, na_position): - pdf = pd.DataFrame({"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]}) +def test_dataframe_multi_column_nulls_multiple_ascending( + ascending, na_position +): + pdf = pd.DataFrame( + {"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]} + ) gdf = DataFrame.from_pandas(pdf) expect = pdf.sort_values( by=["a", "b"], ascending=ascending, na_position=na_position @@ -316,8 +338,12 @@ def _check_scatter_by_map(dfs, col): _check_scatter_by_map( df.scatter_by_map("a", map_size, keep_index=keep), df["a"] ) - _check_scatter_by_map(df.scatter_by_map("b", map_size, keep_index=keep), df["b"]) - _check_scatter_by_map(df.scatter_by_map("c", map_size, keep_index=keep), df["c"]) + _check_scatter_by_map( + df.scatter_by_map("b", map_size, keep_index=keep), df["b"] + ) + _check_scatter_by_map( + df.scatter_by_map("c", map_size, keep_index=keep), df["c"] + ) with pytest.warns(UserWarning): _check_scatter_by_map( df.scatter_by_map("d", map_size, keep_index=keep), df["d"] @@ -347,8 +373,12 @@ def _check_scatter_by_map(dfs, col): isinstance(frame.index, type(df2.index)) -@pytest.mark.parametrize("nelem,dtype", list(product(sort_nelem_args, sort_dtype_args))) -@pytest.mark.parametrize("kind", ["quicksort", "mergesort", "heapsort", "stable"]) +@pytest.mark.parametrize( + "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) +) +@pytest.mark.parametrize( + "kind", ["quicksort", "mergesort", "heapsort", "stable"] +) def test_dataframe_sort_values_kind(nelem, dtype, kind): np.random.seed(0) df = DataFrame() diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index ace62fc53d1..f18cb32a091 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -93,7 +93,9 @@ def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer: def spilled_and_unspilled(manager: SpillManager) -> Tuple[int, int]: """Get bytes spilled and unspilled known by the manager""" spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled) - unspilled = sum(buf.size for buf in manager.buffers() if not buf.is_spilled) + unspilled = sum( + buf.size for buf in manager.buffers() if not buf.is_spilled + ) return spilled, unspilled @@ -320,7 +322,9 @@ def test_spill_to_device_limit(manager: SpillManager): assert single_column_df_data(df3).is_spilled -@pytest.mark.parametrize("manager", [{"device_memory_limit": 0}], indirect=True) +@pytest.mark.parametrize( + "manager", [{"device_memory_limit": 0}], indirect=True +) def test_zero_device_limit(manager: SpillManager): assert manager._device_memory_limit == 0 df1 = single_column_df() @@ -425,8 +429,12 @@ def f(sleep=False, nest=0): futures_with_spill_lock = [] futures_without_spill_lock = [] for _ in range(100): - futures_with_spill_lock.append(executor.submit(f, sleep=True, nest=1)) - futures_without_spill_lock.append(executor.submit(f, sleep=True, nest=1)) + futures_with_spill_lock.append( + executor.submit(f, sleep=True, nest=1) + ) + futures_without_spill_lock.append( + executor.submit(f, sleep=True, nest=1) + ) all(isinstance(f.result(), SpillLock) for f in futures_with_spill_lock) all(f is 
None for f in futures_without_spill_lock) @@ -486,7 +494,9 @@ def test_serialize_dask_dataframe(manager: SpillManager): protocol = pytest.importorskip("distributed.protocol") df1 = single_column_df(target="gpu") - header, frames = protocol.serialize(df1, serializers=("dask",), on_error="raise") + header, frames = protocol.serialize( + df1, serializers=("dask",), on_error="raise" + ) buf = single_column_df_data(df1) assert len(frames) == 1 assert isinstance(frames[0], memoryview) @@ -505,7 +515,9 @@ def test_serialize_cuda_dataframe(manager: SpillManager): protocol = pytest.importorskip("distributed.protocol") df1 = single_column_df(target="gpu") - header, frames = protocol.serialize(df1, serializers=("cuda",), on_error="raise") + header, frames = protocol.serialize( + df1, serializers=("cuda",), on_error="raise" + ) buf: SpillableBuffer = single_column_df_data(df1) assert len(buf.owner._spill_locks) == 1 assert len(frames) == 1 @@ -608,7 +620,9 @@ def test_memoryview_slice(manager: SpillManager, dtype): def test_statistics(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 - buf: SpillableBuffer = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) + buf: SpillableBuffer = as_buffer( + data=rmm.DeviceBuffer(size=10), exposed=False + ) buf.spill(target="cpu") if manager.statistics.level == 0: @@ -632,7 +646,8 @@ def test_statistics_expose(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 buffers: List[SpillableBuffer] = [ - as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) + as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) + for _ in range(10) ] # Expose the first buffer @@ -657,7 +672,8 @@ def test_statistics_expose(manager: SpillManager): # Create and spill 10 new buffers buffers: List[SpillableBuffer] = [ - as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) + as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) + for _ in range(10) ] manager.spill_to_device_limit(0) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 6e604345897..b9eb42906e8 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -109,7 +109,9 @@ def test_series_nunique(nan_as_null, dropna): got = cudf_series.nunique(dropna=dropna) assert expect == got - cudf_series = cudf.Series([1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null) + cudf_series = cudf.Series( + [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null + ) if nan_as_null is True: pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None]) else: @@ -149,7 +151,9 @@ def test_exact_quantiles(int_method): df = pd.DataFrame(arr) gdf_series = cudf.Series(arr) - q1 = gdf_series.quantile(quant_values, interpolation=int_method, exact=True) + q1 = gdf_series.quantile( + quant_values, interpolation=int_method, exact=True + ) q2 = df.quantile(quant_values, interpolation=int_method) @@ -166,7 +170,9 @@ def test_exact_quantiles_int(int_method): df = pd.DataFrame(arr) gdf_series = cudf.Series(arr) - q1 = gdf_series.quantile(quant_values, interpolation=int_method, exact=True) + q1 = gdf_series.quantile( + quant_values, interpolation=int_method, exact=True + ) q2 = df.quantile(quant_values, interpolation=int_method) @@ -282,7 +288,9 @@ def test_kurt_skew_error(op): cudf.Series(np.zeros(100)), cudf.Series(np.repeat(np.nan, 100)), cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), - cudf.Series([5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False), + cudf.Series( + [5, 10, 
53, None, np.nan, None, 12, 43, -423], nan_as_null=False + ), cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([], dtype="float64"), cudf.Series([-3]), @@ -366,7 +374,9 @@ def test_series_pct_change(data, periods, fill_method): ) ): expected = ps.pct_change(periods=periods, fill_method=fill_method) - np.testing.assert_array_almost_equal(got.to_numpy(na_value=np.nan), expected) + np.testing.assert_array_almost_equal( + got.to_numpy(na_value=np.nan), expected + ) @pytest.mark.parametrize( @@ -455,12 +465,14 @@ def test_corr1d(data1, data2, method): ps1_align, ps2_align = ps1.align(ps2, join="inner") - is_singular = (len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0) or ( - len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0 - ) + is_singular = ( + len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0 + ) or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0) is_identical = ( len(ps1_align.dropna().unique()) == 1 and len(ps2_align.dropna()) > 0 - ) or (len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0) + ) or ( + len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0 + ) # Pearson correlation leads to division by 0 when either sample size is 1. # Spearman allows for size 1 samples, but will error if all data in a @@ -527,7 +539,9 @@ def test_nans_stats(data, ops, skipna): psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) gsr = cudf.from_pandas(psr) - assert_eq(getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna)) + assert_eq( + getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) + ) gsr = cudf.Series( data, dtype="float64" if len(data) == 0 else None, nan_as_null=False @@ -587,7 +601,9 @@ def test_cov_corr_datetime_timedelta(data1, data2, dtype): @pytest.mark.parametrize( "data", [ - randomdata(nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}), + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) @@ -617,7 +633,9 @@ def test_kurtosis_df(data, null_flag, numeric_only): @pytest.mark.parametrize( "data", [ - randomdata(nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str}), + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index bd755070334..de771a56e77 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -175,7 +175,11 @@ def test_string_repr(ps_gs, item): "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] ) def test_string_astype(dtype): - if dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("long"): + if ( + dtype.startswith("int") + or dtype.startswith("uint") + or dtype.startswith("long") + ): data = ["1", "2", "3", "4", "5"] elif dtype.startswith("float"): data = [ @@ -295,7 +299,9 @@ def test_string_numeric_astype(dtype): if dtype.startswith("bool"): data = [1, 0, 1, 0, 1] elif ( - dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("long") + dtype.startswith("int") + or dtype.startswith("uint") + or dtype.startswith("long") ): data = [1, 2, 3, 4, 5] elif dtype.startswith("float"): @@ -387,7 +393,9 @@ def _cat_convert_seq_to_cudf(others): gd_others = pd_others if isinstance(gd_others, (list, tuple)): temp_tuple = [ - cudf.from_pandas(elem) if 
isinstance(elem, (pd.Series, pd.Index)) else elem + cudf.from_pandas(elem) + if isinstance(elem, (pd.Series, pd.Index)) + else elem for elem in gd_others ] @@ -761,7 +769,9 @@ def test_string_join(ps_gs, sep): @pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) @pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)]) +@pytest.mark.parametrize( + "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] +) def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ps, gs = ps_gs expectation = raise_builder([flags_raise], NotImplementedError) @@ -882,7 +892,9 @@ def test_string_repeat(data, repeats): @pytest.mark.parametrize("repl", ["qwerty", "", " "]) @pytest.mark.parametrize("case,case_raise", [(None, 0), (True, 1), (False, 1)]) @pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.U, 1)]) -def test_string_replace(ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex): +def test_string_replace( + ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex +): ps, gs = ps_gs expectation = raise_builder([case_raise, flags_raise], NotImplementedError) @@ -1185,7 +1197,9 @@ def test_string_no_children_properties(): ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], ], ) -@pytest.mark.parametrize("index", [-100, -5, -2, -6, -1, 0, 1, 2, 3, 9, 10, 100]) +@pytest.mark.parametrize( + "index", [-100, -5, -2, -6, -1, 0, 1, 2, 3, 9, 10, 100] +) def test_string_get(string, index): pds = pd.Series(string) gds = cudf.Series(string) @@ -1547,7 +1561,10 @@ def test_string_rsplit_re(n, expand): # Pandas does not yet support the regex parameter for rsplit import inspect - assert "regex" not in inspect.signature(pd.Series.str.rsplit).parameters.keys() + assert ( + "regex" + not in inspect.signature(pd.Series.str.rsplit).parameters.keys() + ) expect = ps.str.rsplit(pat=" ", n=n, expand=expand) got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) @@ -1615,15 +1632,23 @@ def test_strings_strip_tests(data, to_strip): ps = pd.Series(data) assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) - assert_eq(ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip)) - assert_eq(ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip)) + assert_eq( + ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip) + ) + assert_eq( + ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) + ) gi = as_index(data) pi = pd.Index(data) assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) - assert_eq(pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip)) - assert_eq(pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip)) + assert_eq( + pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip) + ) + assert_eq( + pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip) + ) def test_string_strip_fail(): @@ -1967,8 +1992,12 @@ def test_string_starts_ends(data, pat): rfunc_args_and_kwargs=([pat],), ) else: - assert_eq(ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False) - assert_eq(ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False) + assert_eq( + ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False + ) + assert_eq( + ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False + ) @pytest.mark.parametrize( @@ -2306,7 +2335,9 @@ def test_string_str_match(data, pat): gs = cudf.Series(data) assert_eq(ps.str.match(pat), 
gs.str.match(pat)) - assert_eq(pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat)) + assert_eq( + pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat) + ) @pytest.mark.parametrize( @@ -2339,12 +2370,20 @@ def test_string_str_translate(data): gs.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), ) assert_eq( - pd.Index(ps).str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), - as_index(gs).str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), + pd.Index(ps).str.translate( + str.maketrans({"a": "z", "i": "$", "z": "1"}) + ), + as_index(gs).str.translate( + str.maketrans({"a": "z", "i": "$", "z": "1"}) + ), ) assert_eq( - ps.str.translate(str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."})), - gs.str.translate(str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."})), + ps.str.translate( + str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) + ), + gs.str.translate( + str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) + ), ) assert_eq( pd.Index(ps).str.translate( @@ -2371,7 +2410,9 @@ def test_string_str_filter_characters(): "", ] gs = cudf.Series(data) - expected = cudf.Series(["helloworld", "ABCD", "", "accnt", None, "150", ""]) + expected = cudf.Series( + ["helloworld", "ABCD", "", "accnt", None, "150", ""] + ) filter = {"a": "z", "A": "Z", "0": "9"} assert_eq(expected, gs.str.filter_characters(filter)) @@ -2618,7 +2659,9 @@ def test_istimestamp_empty(): def test_string_ip4_to_int(): - gsr = cudf.Series(["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"]) + gsr = cudf.Series( + ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] + ) expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) got = gsr.str.ip2int() @@ -3128,20 +3171,28 @@ def test_string_get_json_object_allow_single_quotes(): ] ) assert_eq( - gs.str.get_json_object("$.store.book[0].author", allow_single_quotes=True), + gs.str.get_json_object( + "$.store.book[0].author", allow_single_quotes=True + ), cudf.Series(["Nigel Rees"]), ) assert_eq( - gs.str.get_json_object("$.store.book[*].title", allow_single_quotes=True), + gs.str.get_json_object( + "$.store.book[*].title", allow_single_quotes=True + ), cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]), ) assert_eq( - gs.str.get_json_object("$.store.book[0].author", allow_single_quotes=False), + gs.str.get_json_object( + "$.store.book[0].author", allow_single_quotes=False + ), cudf.Series([None]), ) assert_eq( - gs.str.get_json_object("$.store.book[*].title", allow_single_quotes=False), + gs.str.get_json_object( + "$.store.book[*].title", allow_single_quotes=False + ), cudf.Series([None]), ) @@ -3333,7 +3384,9 @@ def test_str_join_lists_error(): ["-", "_", "**", None], "rep_str", "sep_str", - cudf.Series(["a-rep_str-b", None, "rep_str**hello**rep_str**world", None]), + cudf.Series( + ["a-rep_str-b", None, "rep_str**hello**rep_str**world", None] + ), ), ( cudf.Series([[None, "a"], [None], None]), @@ -3352,7 +3405,9 @@ def test_str_join_lists_error(): ], ) def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected): - actual = sr.str.join(sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep) + actual = sr.str.join( + sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep + ) assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 9566ee19ddd..5dbb86fe27d 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -76,7 +76,9 
@@ def run_udf_test(data, func, dtype): comparing it with the equivalent pandas result """ if dtype == "str": - output = rmm.DeviceBuffer(size=len(data) * _get_extensionty_size(udf_string)) + output = rmm.DeviceBuffer( + size=len(data) * _get_extensionty_size(udf_string) + ) else: dtype = np.dtype(dtype) output = cudf.core.column.column_empty(len(data), dtype=dtype) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 100fc40fb97..60d9516f385 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -226,7 +226,9 @@ def test_dataframe_to_struct(): assert_eq(expect, got) df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - expect = cudf.Series([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}]) + expect = cudf.Series( + [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}] + ) got = df.to_struct() assert_eq(expect, got) @@ -343,7 +345,9 @@ def test_struct_with_datetime_and_timedelta(dtype): def test_struct_int_values(): - series = cudf.Series([{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}]) + series = cudf.Series( + [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] + ) actual_series = series.to_pandas() assert isinstance(actual_series[0]["b"], int) @@ -439,6 +443,8 @@ def test_struct_empty_children_slice(indices, values): def test_struct_iterate_error(): - s = cudf.Series([{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}]) + s = cudf.Series( + [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}] + ) with pytest.raises(TypeError): iter(s.struct) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index e594ba6ba45..1994536f395 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -39,7 +39,9 @@ def arrow_arrays(request): @pytest.mark.parametrize("check_names", [True, False]) @pytest.mark.parametrize("rname", ["a", "b"]) @pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] +) def test_basic_assert_index_equal( rdata, exact, @@ -70,7 +72,10 @@ def test_basic_assert_index_equal( if kind is not None: if (kind == TypeError) and ( msg - == ("Categoricals can only be compared " "if 'categories' are the same.") + == ( + "Categoricals can only be compared " + "if 'categories' are the same." 
+ ) ): kind = AssertionError with pytest.raises(kind): @@ -96,7 +101,9 @@ def test_basic_assert_index_equal( @pytest.mark.parametrize("rname", ["a", "b"]) @pytest.mark.parametrize("check_category_order", [True, False]) @pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"]) +@pytest.mark.parametrize( + "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] +) def test_basic_assert_series_equal( rdata, rname, @@ -156,7 +163,9 @@ def test_assert_column_equal_dtype_edge_cases(other): base = as_column([1, 2, 3]) # for these dtypes, the diff should always be 100% regardless of the values - with pytest.raises(AssertionError, match=r".*values are different \(100.0 %\).*"): + with pytest.raises( + AssertionError, match=r".*values are different \(100.0 %\).*" + ): assert_column_equal(base, other, check_dtype=False) # the exceptions are the empty and all null cases @@ -323,7 +332,9 @@ def test_series_different_type_cases(dtype, check_exact, check_dtype): sr1, sr2, check_exact=check_exact, check_dtype=check_dtype ) else: - assert_series_equal(sr1, sr2, check_exact=check_exact, check_dtype=check_dtype) + assert_series_equal( + sr1, sr2, check_exact=check_exact, check_dtype=check_dtype + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 73e85cd1b7f..0c591965361 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -84,7 +84,9 @@ @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) def test_timedelta_series_create(data, dtype): if dtype not in ("timedelta64[ns]"): - pytest.skip("Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465") + pytest.skip( + "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" + ) psr = pd.Series( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype ) @@ -106,7 +108,9 @@ def test_timedelta_series_create(data, dtype): @pytest.mark.parametrize("cast_dtype", ["int64", "category"]) def test_timedelta_from_typecast(data, dtype, cast_dtype): if dtype not in ("timedelta64[ns]"): - pytest.skip("Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465") + pytest.skip( + "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" + ) psr = pd.Series( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype ) @@ -363,7 +367,9 @@ def test_timedelta_ops_datetime_inputs( ), pd.DataFrame( { - "A": pd.Series(pd.date_range("1994-1-1", periods=50, freq="D")), + "A": pd.Series( + pd.date_range("1994-1-1", periods=50, freq="D") + ), "B": pd.Series([pd.Timedelta(days=i) for i in range(50)]), } ), @@ -658,7 +664,9 @@ def test_timedelta_reduction_ops(data, dtype, reduction_op): actual = getattr(gsr, reduction_op)() if pd.isna(expected) and pd.isna(actual): pass - elif isinstance(expected, pd.Timedelta) and isinstance(actual, pd.Timedelta): + elif isinstance(expected, pd.Timedelta) and isinstance( + actual, pd.Timedelta + ): assert ( expected.round(gsr._column.time_unit).value == actual.round(gsr._column.time_unit).value @@ -736,7 +744,9 @@ def test_timedelta_index(data, dtype): @pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW) @pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) @pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_index_datetime_index_ops(data, datetime_dtype, timedelta_dtype): +def 
test_timedelta_index_datetime_index_ops( + data, datetime_dtype, timedelta_dtype +): gdt = cudf.Index(data, dtype=datetime_dtype) gtd = cudf.Index(data, dtype=timedelta_dtype) @@ -836,7 +846,9 @@ def test_timedelta_datetime_index_ops_misc( ], ) @pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") -def test_timedelta_index_ops_with_scalars(request, data, other_scalars, dtype, op): +def test_timedelta_index_ops_with_scalars( + request, data, other_scalars, dtype, op +): gtdi = cudf.Index(data=data, dtype=dtype) ptdi = gtdi.to_pandas() @@ -907,7 +919,9 @@ def test_timedelta_index_ops_with_scalars(request, data, other_scalars, dtype, o "floordiv", ], ) -def test_timedelta_index_ops_with_cudf_scalars(request, data, cpu_scalar, dtype, op): +def test_timedelta_index_ops_with_cudf_scalars( + request, data, cpu_scalar, dtype, op +): gtdi = cudf.Index(data=data, dtype=dtype) ptdi = gtdi.to_pandas() @@ -1052,14 +1066,20 @@ def test_timedelta_fillna(data, dtype, fill_value): ), ( cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - cudf.Series(["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"]), + cudf.Series( + ["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"] + ), ), ( cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[s]"), - cudf.Series(["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"]), + cudf.Series( + ["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"] + ), ), ( - cudf.Series([None, None, None, None, None], dtype="timedelta64[us]"), + cudf.Series( + [None, None, None, None, None], dtype="timedelta64[us]" + ), cudf.Series([None, None, None, None, None], dtype="str"), ), ( diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index bc66e00ffdd..4843decedba 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -210,7 +210,9 @@ def func(row): # we should get: # [?, ?, , , ] - gdf = cudf.DataFrame({"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]}) + gdf = cudf.DataFrame( + {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} + ) run_masked_udf_test(func, gdf, check_dtype=False) @@ -278,7 +280,8 @@ def func(row): request.applymarker( pytest.mark.xfail( condition=( - (gdf["data"] == 1).any() and op in {operator.pow, operator.ipow} + (gdf["data"] == 1).any() + and op in {operator.pow, operator.ipow} ), reason="https://github.com/rapidsai/cudf/issues/7478", ) @@ -537,7 +540,9 @@ def func(x): # in pandas, 1**NA == 1. In cudf, 1**NA == NA. request.applymarker( pytest.mark.xfail( - condition=(constant is cudf.NA and op in {operator.pow, operator.ipow}), + condition=( + constant is cudf.NA and op in {operator.pow, operator.ipow} + ), reason="https://github.com/rapidsai/cudf/issues/7478", ) ) @@ -557,7 +562,9 @@ def func(x): # in pandas, 1**NA == 1. In cudf, 1**NA == NA. 
request.applymarker( pytest.mark.xfail( - condition=(constant in {1} and op in {operator.pow, operator.ipow}), + condition=( + constant in {1} and op in {operator.pow, operator.ipow} + ), reason="https://github.com/rapidsai/cudf/issues/7478", ) ) @@ -608,7 +615,9 @@ def outer(row): y = row["b"] return inner(x, y) - gdf = cudf.DataFrame({"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]}) + gdf = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) with pytest.raises(ValueError): gdf.apply(outer, axis=1) @@ -645,7 +654,9 @@ def func(row): @pytest.mark.parametrize( "unsupported_col", [ - _decimal_series(["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)), + _decimal_series( + ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) + ), cudf.Series([1, 2, 3], dtype="category"), cudf.interval_range(start=0, end=3), [[1, 2], [3, 4], [5, 6]], @@ -808,7 +819,9 @@ def f(x, c): assert precompiled.currsize == 2 -@pytest.mark.parametrize("data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]]) +@pytest.mark.parametrize( + "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] +) @pytest.mark.parametrize("operator", [float, int, bool]) def test_masked_udf_casting(operator, data): data = cudf.Series(data) @@ -993,7 +1006,9 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) - @pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) + @pytest.mark.parametrize( + "concat_char", ["1", "a", "12", " ", "", ".", "@"] + ) def test_string_udf_concat(self, str_udf_data, concat_char): def func(row): return row["str_col"] + concat_char diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 51ae5aa7de5..15d9d03d4a7 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
import itertools import operator @@ -118,7 +118,8 @@ def test_scalar_no_negative_bools(): with pytest.raises( TypeError, match=re.escape( - "Boolean scalars in cuDF do not " "support negation, use logical not" + "Boolean scalars in cuDF do not " + "support negation, use logical not" ), ): -x diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index a3ff76988d3..b21edc0477f 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -16,9 +16,14 @@ def datadir(datadir): def assert_equal_tokenization_outputs(hf_output, cudf_output): - assert np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 assert ( - np.sum(hf_output["attention_mask"] != cudf_output["attention_mask"].get()) == 0 + np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 + ) + assert ( + np.sum( + hf_output["attention_mask"] != cudf_output["attention_mask"].get() + ) + == 0 ) @@ -27,8 +32,12 @@ def assert_equal_tokenization_outputs(hf_output, cudf_output): @pytest.mark.parametrize("stride", [0, 15, 30]) @pytest.mark.parametrize("add_special_tokens", [True, False]) @pytest.mark.parametrize("do_lower_case", [True, False]) -def test_subword_tokenize(seq_len, stride, add_special_tokens, do_lower_case, datadir): - with open(os.path.join(datadir, "test_sentences.txt"), encoding="utf-8") as file: +def test_subword_tokenize( + seq_len, stride, add_special_tokens, do_lower_case, datadir +): + with open( + os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" + ) as file: input_sentence_ls = [line.strip() for line in file] vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") @@ -119,7 +128,9 @@ def test_text_subword_tokenize(tmpdir): cudf_tokenizer = SubwordTokenizer(hash_file) - token_d = cudf_tokenizer(sr, 8, 8, add_special_tokens=False, truncation=True) + token_d = cudf_tokenizer( + sr, 8, 8, add_special_tokens=False, truncation=True + ) tokens, masks, metadata = ( token_d["input_ids"], token_d["attention_mask"], diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 097272c8aab..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
import random import string @@ -109,7 +109,9 @@ def test_detokenize(): assert type(expected) == type(actual) assert_eq(expected, actual) - indices = cudf.Series([4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8) + indices = cudf.Series( + [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 + ) actual = strings.str.detokenize(indices, "+") expected = cudf.Series( [ @@ -679,7 +681,9 @@ def test_text_replace_tokens(): "emptyme", ], ) - targets = cudf.Series(["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"]) + targets = cudf.Series( + ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"] + ) replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""]) expected = cudf.Series( @@ -693,7 +697,9 @@ def test_text_replace_tokens(): assert_eq(expected, actual) - sr = cudf.Series(["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]) + sr = cudf.Series( + ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] + ) targets = cudf.Series(["🌬", "🔥", "🌊"]) replacements = "🚰" @@ -749,7 +755,9 @@ def test_text_filter_tokens(): actual = sr.str.filter_tokens(5, "🔥") assert_eq(expected, actual) - sr = cudf.Series(["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]) + sr = cudf.Series( + ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] + ) expected = cudf.Series( ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"] ) @@ -962,7 +970,9 @@ def test_jaccard_index(): str1.str.jaccard_index(str3, 5) -def _make_list_of_strings_of_random_length(num_strings, min_length, max_length): +def _make_list_of_strings_of_random_length( + num_strings, min_length, max_length +): return [ "".join( random.choice(string.ascii_lowercase) diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 8862ffae125..494b48b3cfd 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -120,13 +120,17 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file(_get_cc_60_ptx_file()) + ptx_toolkit_version = _get_cuda_version_from_ptx_file( + _get_cc_60_ptx_file() + ) # MVC is required whenever any PTX is newer than the driver # This could be the shipped PTX file or the PTX emitted by # the version of NVVM on the user system, the latter aligning # with the runtime version - if (driver_version < ptx_toolkit_version) or (driver_version < runtime_version): + if (driver_version < ptx_toolkit_version) or ( + driver_version < runtime_version + ): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: @@ -182,19 +186,25 @@ def _get_cuda_version_from_ptx_file(path): cuda_ver = ver_map.get(version) if cuda_ver is None: - raise ValueError(f"Could not map PTX version {version} to a CUDA version") + raise ValueError( + f"Could not map PTX version {version} to a CUDA version" + ) return cuda_ver class _CUDFNumbaConfig: def __enter__(self): - self.CUDA_LOW_OCCUPANCY_WARNINGS = numba_config.CUDA_LOW_OCCUPANCY_WARNINGS + self.CUDA_LOW_OCCUPANCY_WARNINGS = ( + numba_config.CUDA_LOW_OCCUPANCY_WARNINGS + ) numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 self.CAPTURED_ERRORS = numba_config.CAPTURED_ERRORS numba_config.CAPTURED_ERRORS = "new_style" def __exit__(self, exc_type, exc_value, traceback): - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = self.CUDA_LOW_OCCUPANCY_WARNINGS + numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = ( + self.CUDA_LOW_OCCUPANCY_WARNINGS + ) numba_config.CAPTURED_ERRORS = 
self.CAPTURED_ERRORS diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py index 616f816ccc7..54f5ea08ee1 100644 --- a/python/cudf/cudf/utils/_ptxcompiler.py +++ b/python/cudf/cudf/utils/_ptxcompiler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,8 +85,12 @@ def safe_get_versions(): try: # allow user to specify driver/runtime # versions manually, if necessary - driver_version = os.environ["PTXCOMPILER_KNOWN_DRIVER_VERSION"].split(".") - runtime_version = os.environ["PTXCOMPILER_KNOWN_RUNTIME_VERSION"].split(".") + driver_version = os.environ[ + "PTXCOMPILER_KNOWN_DRIVER_VERSION" + ].split(".") + runtime_version = os.environ[ + "PTXCOMPILER_KNOWN_RUNTIME_VERSION" + ].split(".") driver_version, runtime_version = ( tuple(map(int, driver_version)), tuple(map(int, runtime_version)), diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index ce3839c3644..d57303ca122 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -66,7 +66,9 @@ @doc_apply() -def apply_rows(df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key): +def apply_rows( + df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key +): """Row-wise transformation Parameters @@ -114,15 +116,21 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): nullmask = column.as_column(df[k]._column.nullmask) if out_mask is None: - out_mask = column.as_column(nullmask.copy(), dtype=utils.mask_dtype) + out_mask = column.as_column( + nullmask.copy(), dtype=utils.mask_dtype + ) else: - out_mask = libcudf.binaryop.binaryop(nullmask, out_mask, op, out_mask.dtype) + out_mask = libcudf.binaryop.binaryop( + nullmask, out_mask, op, out_mask.dtype + ) return out_mask class ApplyKernelCompilerBase: - def __init__(self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key): + def __init__( + self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key + ): # Get signature of user function sig = pysignature(func) self.sig = sig @@ -143,14 +151,15 @@ def run(self, df, **launch_params): } else: inputs = { - k: df[k]._column.data_array_view(mode="read") for k in self.incols + k: df[k]._column.data_array_view(mode="read") + for k in self.incols } # Allocate output columns outputs = {} for k, dt in self.outcols.items(): - outputs[k] = column.column_empty(len(df), dt, False).data_array_view( - mode="write" - ) + outputs[k] = column.column_empty( + len(df), dt, False + ).data_array_view(mode="write") # Bind argument args = {} for dct in [inputs, outputs, self.kwargs]: @@ -166,7 +175,9 @@ def run(self, df, **launch_params): # Prepare output frame outdf = df.copy() for k in sorted(self.outcols): - outdf[k] = cudf.Series(outputs[k], index=outdf.index, nan_as_null=False) + outdf[k] = cudf.Series( + outputs[k], index=outdf.index, nan_as_null=False + ) if out_mask is not None: outdf._data[k] = outdf[k]._column.set_mask( out_mask.data_array_view(mode="write") @@ -191,7 +202,9 @@ def launch_kernel(self, df, args): class ApplyChunksCompiler(ApplyKernelCompilerBase): def compile(self, func, argnames, extra_argnames): # Compile kernel - kernel = _load_cache_or_make_chunk_wise_kernel(func, argnames, extra_argnames) + kernel = _load_cache_or_make_chunk_wise_kernel( + func, argnames, extra_argnames + ) return kernel def 
launch_kernel(self, df, args, chunks, blkct=None, tpb=None): @@ -209,9 +222,9 @@ def launch_kernel(self, df, args, chunks, blkct=None, tpb=None): def normalize_chunks(self, size, chunks): if isinstance(chunks, int): # *chunks* is the chunksize - return cuda.as_cuda_array(cp.arange(start=0, stop=size, step=chunks)).view( - "int64" - ) + return cuda.as_cuda_array( + cp.arange(start=0, stop=size, step=chunks) + ).view("int64") else: # *chunks* is an array of chunk leading offset return cuda.as_cuda_array(cp.asarray(chunks)).view("int64") @@ -246,7 +259,9 @@ def row_wise_kernel({args}): stop = "" stride = "ntid" srcidx = "{a} = {a}[{start}:{stop}:{stride}]" - body.append(srcidx.format(a=a, start=start, stop=stop, stride=stride)) + body.append( + srcidx.format(a=a, start=start, stop=stop, stride=stride) + ) body.append(f"inner({args})") @@ -296,7 +311,9 @@ def chunk_wise_kernel(nrows, chunks, {args}): body.append(indent + "start = chunks[curblk]") body.append( - indent + "stop = chunks[curblk + 1]" + " if curblk + 1 < chunks.size else nrows" + indent + + "stop = chunks[curblk + 1]" + + " if curblk + 1 < chunks.size else nrows" ) slicedargs = {} @@ -306,7 +323,9 @@ def chunk_wise_kernel(nrows, chunks, {args}): else: slicedargs[a] = str(a) body.append( - "{}inner({})".format(indent, ", ".join(slicedargs[k] for k in argnames)) + "{}inner({})".format( + indent, ", ".join(slicedargs[k] for k in argnames) + ) ) indented = ["{}{}".format(" " * 4, ln) for ln in body] diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index dc4624a9a14..020c32de9f3 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
from pickle import dumps @@ -29,12 +29,16 @@ def window_sizes_from_offset(arr, offset): window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32") if arr.size > 0: with _CUDFNumbaConfig(): - gpu_window_sizes_from_offset.forall(arr.size)(arr, window_sizes, offset) + gpu_window_sizes_from_offset.forall(arr.size)( + arr, window_sizes, offset + ) return window_sizes @cuda.jit -def gpu_grouped_window_sizes_from_offset(arr, window_sizes, group_starts, offset): +def gpu_grouped_window_sizes_from_offset( + arr, window_sizes, group_starts, offset +): i = cuda.grid(1) j = i if i < arr.size: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index d6438833846..e9dbc23d767 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -50,7 +50,8 @@ np.dtype("float64"): pd.Float64Dtype(), } pandas_dtypes_to_np_dtypes = { - pd_dtype: np_dtype for np_dtype, pd_dtype in np_dtypes_to_pandas_dtypes.items() + pd_dtype: np_dtype + for np_dtype, pd_dtype in np_dtypes_to_pandas_dtypes.items() } pyarrow_dtypes_to_pandas_dtypes = { @@ -127,11 +128,17 @@ def _find_common_type_decimal(dtypes): p = s + lhs if p > cudf.Decimal64Dtype.MAX_PRECISION: - return cudf.Decimal128Dtype(min(cudf.Decimal128Dtype.MAX_PRECISION, p), s) + return cudf.Decimal128Dtype( + min(cudf.Decimal128Dtype.MAX_PRECISION, p), s + ) elif p > cudf.Decimal32Dtype.MAX_PRECISION: - return cudf.Decimal64Dtype(min(cudf.Decimal64Dtype.MAX_PRECISION, p), s) + return cudf.Decimal64Dtype( + min(cudf.Decimal64Dtype.MAX_PRECISION, p), s + ) else: - return cudf.Decimal32Dtype(min(cudf.Decimal32Dtype.MAX_PRECISION, p), s) + return cudf.Decimal32Dtype( + min(cudf.Decimal32Dtype.MAX_PRECISION, p), s + ) def cudf_dtype_from_pydata_dtype(dtype): @@ -158,7 +165,9 @@ def cudf_dtype_to_pa_type(dtype): Python dtype. """ if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError("No conversion from Categorical to pyarrow type") + raise NotImplementedError( + "No conversion from Categorical to pyarrow type" + ) elif isinstance( dtype, (cudf.StructDtype, cudf.ListDtype, cudf.core.dtypes.DecimalDtype), @@ -192,12 +201,15 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(val, cudf.Scalar): + if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + val, cudf.Scalar + ): return val if not cudf.api.types._is_scalar_or_zero_d_array(val): raise ValueError( - f"Cannot convert value of type {type(val).__name__} " "to cudf scalar" + f"Cannot convert value of type {type(val).__name__} " + "to cudf scalar" ) if isinstance(val, Decimal): @@ -206,9 +218,9 @@ def to_cudf_compatible_scalar(val, dtype=None): if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: val = val.item() - if ((dtype is None) and isinstance(val, str)) or cudf.api.types.is_string_dtype( - dtype - ): + if ( + (dtype is None) and isinstance(val, str) + ) or cudf.api.types.is_string_dtype(dtype): dtype = "str" if isinstance(val, str) and val.endswith("\x00"): @@ -220,7 +232,9 @@ def to_cudf_compatible_scalar(val, dtype=None): # the string value directly (cudf.DeviceScalar will DTRT) return val - tz_error_msg = "Cannot covert a timezone-aware timestamp to timezone-naive scalar." + tz_error_msg = ( + "Cannot covert a timezone-aware timestamp to timezone-naive scalar." 
+ ) if isinstance(val, pd.Timestamp): if val.tz is not None: raise NotImplementedError(tz_error_msg) @@ -235,9 +249,9 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type(cudf.api.types.pandas_dtype(type(val))).type( - val - ) + val = _maybe_convert_to_default_type( + cudf.api.types.pandas_dtype(type(val)) + ).type(val) if dtype is not None: if isinstance(val, str) and np.dtype(dtype).kind == "M": @@ -404,9 +418,9 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype(dtype) or pd.api.types.is_timedelta64_dtype( + if pd.api.types.is_datetime64_dtype( dtype - ): + ) or pd.api.types.is_timedelta64_dtype(dtype): time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -416,7 +430,9 @@ def _get_nan_for_dtype(dtype): def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): - error = TypeError(f"{op} not supported between {dtype_l} and {dtype_r} scalars") + error = TypeError( + f"{op} not supported between {dtype_l} and {dtype_r} scalars" + ) to_numpy_ops = { "__add__": _ADD_TYPES, @@ -447,7 +463,9 @@ def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): for valid_combo in allowed: ltype, rtype, outtype = valid_combo - if np.can_cast(dtype_l.char, ltype) and np.can_cast(dtype_r.char, rtype): + if np.can_cast(dtype_l.char, ltype) and np.can_cast( + dtype_r.char, rtype + ): return outtype raise error @@ -505,14 +523,20 @@ def find_common_type(dtypes): # Aggregate same types dtypes = set(dtypes) - if any(isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes): + if any( + isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes + ): if all( cudf.api.types.is_decimal_dtype(dtype) or cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes ): return _find_common_type_decimal( - [dtype for dtype in dtypes if cudf.api.types.is_decimal_dtype(dtype)] + [ + dtype + for dtype in dtypes + if cudf.api.types.is_decimal_dtype(dtype) + ] ) else: return cudf.dtype("O") @@ -526,24 +550,30 @@ def find_common_type(dtypes): # ListDtype(int64) & ListDtype(int32) common # dtype could be ListDtype(int64). 
raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " "not supported" + "Finding a common type for `ListDtype` is currently " + "not supported" ) if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): if len(dtypes) == 1: return dtypes.get(0) else: raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " "not supported" + "Finding a common type for `StructDtype` is currently " + "not supported" ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set(filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes)) + dt_dtypes = set( + filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) + ) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set(filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes)) + td_dtypes = set( + filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) + ) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) @@ -626,12 +656,16 @@ def _maybe_convert_to_default_type(dtype): """ if cudf.get_option("default_integer_bitwidth"): if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype(f'i{cudf.get_option("default_integer_bitwidth")//8}') + return cudf.dtype( + f'i{cudf.get_option("default_integer_bitwidth")//8}' + ) elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype(f'u{cudf.get_option("default_integer_bitwidth")//8}') - if cudf.get_option("default_float_bitwidth") and cudf.api.types.is_float_dtype( - dtype - ): + return cudf.dtype( + f'u{cudf.get_option("default_integer_bitwidth")//8}' + ) + if cudf.get_option( + "default_float_bitwidth" + ) and cudf.api.types.is_float_dtype(dtype): return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') return dtype @@ -650,7 +684,9 @@ def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: return True return False - elif is_integer(element) or (is_float(element) and element.is_integer()): + elif is_integer(element) or ( + is_float(element) and element.is_integer() + ): info = np.iinfo(dtype) if info.min <= element <= info.max: return True diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 386343e9e63..b5387ddeb5f 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -7,7 +7,10 @@ def validate_setup(): # TODO: Remove the following check once we arrive at a solution for #4827 # This is a temporary workaround to unblock internal testing # related issue: https://github.com/rapidsai/cudf/issues/4827 - if "RAPIDS_NO_INITIALIZE" in os.environ or "CUDF_NO_INITIALIZE" in os.environ: + if ( + "RAPIDS_NO_INITIALIZE" in os.environ + or "CUDF_NO_INITIALIZE" in os.environ + ): return import warnings @@ -126,7 +129,8 @@ def validate_setup(): # Driver Runtime version is >= Runtime version pass elif ( - cuda_driver_supported_rt_version >= 11000 and cuda_runtime_version >= 11000 + cuda_driver_supported_rt_version >= 11000 + and cuda_runtime_version >= 11000 ): # With cuda enhanced compatibility any code compiled # with 11.x version of cuda can now run on any diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index 9e04e6300de..ef078ed8c5d 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -24,7 +24,9 @@ # Shifts for bit packing A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - 
A_SECOND_LEVEL_POW) -B_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW) +B_SECOND_LEVEL_SHIFT_AMT = np.uint8( + 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW +) BITS_FOR_INNER_TABLE_SIZE = np.uint8(8) NOT_FOUND = -1 @@ -91,8 +93,12 @@ def _find_hash_for_internal(hash_bin): new_length = _new_bin_length(len(hash_bin)) while True: - a = np.random.randint(A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH) - b = np.random.randint(B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH) + a = np.random.randint( + A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH + ) + b = np.random.randint( + B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH + ) bins = _make_bins(hash_bin, new_length, a, b) max_length = len(max(bins, key=len)) @@ -109,7 +115,9 @@ def _perfect_hash(integers, max_constant): ) flattened_bins = [] - internal_table_coeffs = np.zeros(shape=[num_top_level_bins], dtype=np.uint64) + internal_table_coeffs = np.zeros( + shape=[num_top_level_bins], dtype=np.uint64 + ) offset_into_flattened_table = np.zeros( shape=[num_top_level_bins + 1], dtype=np.uint64 ) @@ -126,7 +134,9 @@ def _perfect_hash(integers, max_constant): | coeff_b << B_SECOND_LEVEL_SHIFT_AMT | bin_length ) - offset_into_flattened_table[i + 1] = offset_into_flattened_table[i] + bin_length + offset_into_flattened_table[i + 1] = ( + offset_into_flattened_table[i] + bin_length + ) flattened_bins.extend(internal_table) print( @@ -189,7 +199,8 @@ def _store_func( f.write(f"{len(hash_table)}\n") f.writelines(f"{kv}\n" for kv in hash_table) f.writelines( - f"{tok_id}\n" for tok_id in [unk_tok_id, first_token_id, sep_token_id] + f"{tok_id}\n" + for tok_id in [unk_tok_id, first_token_id, sep_token_id] ) @@ -278,6 +289,8 @@ def hash_vocab( inner_table_coeffs, offsets_into_ht, ) - assert val == value, f"Incorrect value found. Got {val} expected {value}" + assert ( + val == value + ), f"Incorrect value found. Got {val} expected {value}" print("All present tokens return correct value.") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index bd058cf8465..85abf438efb 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -118,7 +118,9 @@ -------- cudf.read_parquet """ -doc_read_parquet_metadata = docfmt_partial(docstring=_docstring_read_parquet_metadata) +doc_read_parquet_metadata = docfmt_partial( + docstring=_docstring_read_parquet_metadata +) _docstring_read_parquet = """ Load a Parquet dataset into a DataFrame @@ -392,7 +394,9 @@ -------- cudf.read_orc """ -doc_read_orc_statistics = docfmt_partial(docstring=_docstring_read_orc_statistics) +doc_read_orc_statistics = docfmt_partial( + docstring=_docstring_read_orc_statistics +) _docstring_read_orc = """ Load an ORC dataset into a DataFrame @@ -1613,7 +1617,9 @@ def _open_remote_files( # Use fsspec.parquet module. # TODO: Use `cat_ranges` to collect "known" # parts for all files at once. 
- row_groups = precache_options.pop("row_groups", None) or ([None] * len(paths)) + row_groups = precache_options.pop("row_groups", None) or ( + [None] * len(paths) + ) return [ ArrowPythonFile( _set_context( @@ -1642,7 +1648,8 @@ def _open_remote_files( # Default open - Use pyarrow filesystem API pa_fs = PyFileSystem(FSSpecHandler(fs)) return [ - _set_context(pa_fs.open_input_file(fpath), context_stack) for fpath in paths + _set_context(pa_fs.open_input_file(fpath), context_stack) + for fpath in paths ] @@ -1669,7 +1676,9 @@ def get_reader_filepath_or_buffer( # Get a filesystem object if one isn't already available paths = [path_or_data] if fs is None: - fs, paths = _get_filesystem_and_paths(path_or_data, storage_options) + fs, paths = _get_filesystem_and_paths( + path_or_data, storage_options + ) if fs is None: if warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. @@ -1709,7 +1718,9 @@ def get_reader_filepath_or_buffer( ) elif warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." + assert ( + PANDAS_LT_300 + ), "Need to drop after pandas-3.0 support is added." warnings.warn( f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " "deprecated and will be removed in a future version. " @@ -1964,7 +1975,10 @@ def _apply_predicate(op, val, col_stats): def _apply_filters(filters, stats): for conjunction in filters: - if all(_apply_predicate(op, val, stats[col]) for col, op, val in conjunction): + if all( + _apply_predicate(op, val, stats[col]) + for col, op, val in conjunction + ): return True return False @@ -2005,7 +2019,9 @@ def _fsspec_data_transfer( # Require `fs` if `path_or_fob` is not file-like file_like = is_file_like(path_or_fob) if fs is None and not file_like: - raise ValueError("fs must be defined if `path_or_fob` is not file-like") + raise ValueError( + "fs must be defined if `path_or_fob` is not file-like" + ) # Calculate total file size if file_like: diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py index c8bf14b2dba..a4404e51232 100644 --- a/python/cudf/cudf/utils/nvtx_annotation.py +++ b/python/cudf/cudf/utils/nvtx_annotation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. import hashlib from functools import partial @@ -25,4 +25,6 @@ def _cudf_nvtx_annotate(func, domain="cudf_python"): )(func) -_dask_cudf_nvtx_annotate = partial(_cudf_nvtx_annotate, domain="dask_cudf_python") +_dask_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="dask_cudf_python" +) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 38ab3e89336..239438afd24 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
import ast import datetime @@ -22,7 +22,8 @@ ENVREF_PREFIX = "__CUDF_ENVREF__" SUPPORTED_QUERY_TYPES = { - np.dtype(dt) for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES + np.dtype(dt) + for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES } @@ -40,7 +41,9 @@ def visit_Name(self, node): raise QuerySyntaxError("assignment is not allowed") name = node.id - chosen = self.refnames if name.startswith(ENVREF_PREFIX) else self.colnames + chosen = ( + self.refnames if name.startswith(ENVREF_PREFIX) else self.colnames + ) chosen.add(name) @@ -94,7 +97,9 @@ def query_builder(info, funcid): func: a python function of the query """ args = info["args"] - def_line = "def {funcid}({args}):".format(funcid=funcid, args=", ".join(args)) + def_line = "def {funcid}({args}):".format( + funcid=funcid, args=", ".join(args) + ) lines = [def_line, " return {}".format(info["source"])] source = "\n".join(lines) glbs = {} @@ -215,7 +220,8 @@ def query_execute(df, expr, callenv): # wait to check the types until we know which cols are used if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): raise TypeError( - "query only supports numeric, datetime, timedelta, " "or bool dtypes." + "query only supports numeric, datetime, timedelta, " + "or bool dtypes." ) colarrays = [col.data_array_view(mode="read") for col in colarrays] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index acbf02f0359..95621cf9519 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -163,7 +163,8 @@ def wrapper(*args, **kwargs): fn = frame.f_code.co_filename if _cudf_root in fn and _tests_root not in fn: raise RuntimeError( - f"External-only API called in {fn} at line {lineno}. " f"{alternative}" + f"External-only API called in {fn} at line {lineno}. " + f"{alternative}" ) return func(*args, **kwargs) @@ -225,7 +226,9 @@ def __getattr__(self, key): try: return self[key] except KeyError: - raise AttributeError(f"{type(self).__name__} object has no attribute {key}") + raise AttributeError( + f"{type(self).__name__} object has no attribute {key}" + ) class NotIterable: @@ -368,7 +371,9 @@ def _is_same_name(left_name, right_name): right_name, decimal.Decimal ): return left_name.is_nan() and right_name.is_nan() - if isinstance(left_name, float) and isinstance(right_name, float): + if isinstance(left_name, float) and isinstance( + right_name, float + ): return np.isnan(left_name) and np.isnan(right_name) if isinstance(left_name, np.datetime64) and isinstance( right_name, np.datetime64 diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index ac5a50473d1..e7d327401d9 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -174,12 +174,12 @@ def test_groupby_apply_fallback(dataframe, groupby_udf): def test_groupby_external_series_apply_fallback(dataframe, groupby_udf): pdf, df = dataframe tm.assert_equal( - pdf.groupby(pd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True).apply( - groupby_udf - ), - df.groupby(xpd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True).apply( - groupby_udf - ), + pdf.groupby( + pd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True + ).apply(groupby_udf), + df.groupby( + xpd.Series([1, 2, 1, 2, 1]), sort=True, group_keys=True + ).apply(groupby_udf), ) @@ -374,7 +374,9 @@ def test_excel_round_trip(dataframe): excel_pdf.seek(0) excel_cudf_pandas.seek(0) - tm.assert_frame_equal(pd.read_excel(excel_pdf), xpd.read_excel(excel_cudf_pandas)) + tm.assert_frame_equal( + pd.read_excel(excel_pdf), xpd.read_excel(excel_cudf_pandas) + ) def test_hash_array(series): @@ -394,15 +396,21 @@ def test_is_sparse(): def test_is_file_like(): assert pd.api.types.is_file_like("a") == xpd.api.types.is_file_like("a") - assert pd.api.types.is_file_like(BytesIO()) == xpd.api.types.is_file_like(BytesIO()) - assert pd.api.types.is_file_like(StringIO("abc")) == xpd.api.types.is_file_like( - StringIO("abc") + assert pd.api.types.is_file_like(BytesIO()) == xpd.api.types.is_file_like( + BytesIO() ) + assert pd.api.types.is_file_like( + StringIO("abc") + ) == xpd.api.types.is_file_like(StringIO("abc")) def test_is_re_compilable(): - assert pd.api.types.is_re_compilable(".^") == xpd.api.types.is_re_compilable(".^") - assert pd.api.types.is_re_compilable(".*") == xpd.api.types.is_re_compilable(".*") + assert pd.api.types.is_re_compilable( + ".^" + ) == xpd.api.types.is_re_compilable(".^") + assert pd.api.types.is_re_compilable( + ".*" + ) == xpd.api.types.is_re_compilable(".*") def test_module_attribute_types(): @@ -424,8 +432,12 @@ def test_infer_freq(): def test_groupby_grouper_fallback(dataframe, groupby_udf): pdf, df = dataframe tm.assert_equal( - pdf.groupby(pd.Grouper("a"), sort=True, group_keys=True).apply(groupby_udf), - df.groupby(xpd.Grouper("a"), sort=True, group_keys=True).apply(groupby_udf), + pdf.groupby(pd.Grouper("a"), sort=True, group_keys=True).apply( + groupby_udf + ), + df.groupby(xpd.Grouper("a"), sort=True, group_keys=True).apply( + groupby_udf + ), ) @@ -514,7 +526,9 @@ def test_pyarrow_array_construction(data): assert actual_pa_array.equals(expected_pa_array) -@pytest.mark.parametrize("op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"]) +@pytest.mark.parametrize( + "op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"] +) def test_cudf_pandas_eval_series(op): lhs = xpd.Series([10, 11, 12]) # noqa: F841 rhs = xpd.Series([100, 1, 12]) # noqa: F841 @@ -529,7 +543,9 @@ def test_cudf_pandas_eval_series(op): tm.assert_series_equal(expected, actual) -@pytest.mark.parametrize("op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"]) +@pytest.mark.parametrize( + "op", [">", "<", "==", "<=", ">=", "+", "%", "-", "*", "/"] +) def test_cudf_pandas_eval_dataframe(op): lhs = xpd.DataFrame({"a": [10, 11, 12], "b": [1, 2, 3]}) # noqa: F841 rhs = xpd.DataFrame({"a": [100, 1, 12], "b": [15, -10, 3]}) # noqa: F841 @@ -544,7 +560,9 @@ def test_cudf_pandas_eval_dataframe(op): tm.assert_frame_equal(expected, actual) -@pytest.mark.parametrize("expr", ["((a + b) * c % d) > e", "((a + b) * c % d)"]) +@pytest.mark.parametrize( + "expr", ["((a + b) * c % d) > e", "((a + b) * c % d)"] +) def test_cudf_pandas_eval_complex(expr): data = { "a": [10, 11, 12], @@ 
-733,7 +751,9 @@ def test_chunked_csv_reader(tmpdir, data): tm.assert_equal(pd_chunk, xpd_chunk, check_index_type=False) -@pytest.mark.parametrize("data", [(), (1,), (1, 2, 3), ("a", "b", "c"), (1, 2, "test")]) +@pytest.mark.parametrize( + "data", [(), (1,), (1, 2, 3), ("a", "b", "c"), (1, 2, "test")] +) def test_construct_from_generator(data): expect = pd.Series((x for x in data)) got = xpd.Series((x for x in data)) @@ -772,7 +792,9 @@ def test_construct_timedelta_index(): ) def test_datetime_ops(op): pd_dt_idx1 = pd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") - cudf_pandas_dt_idx = xpd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") + cudf_pandas_dt_idx = xpd.DatetimeIndex( + [10, 20, 30], dtype="datetime64[ns]" + ) tm.assert_equal( op(pd_dt_idx1, pd_dt_idx1), op(cudf_pandas_dt_idx, cudf_pandas_dt_idx) @@ -793,7 +815,9 @@ def test_datetime_ops(op): ) def test_timedelta_ops(op): pd_td_idx1 = pd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") - cudf_pandas_td_idx = xpd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") + cudf_pandas_td_idx = xpd.TimedeltaIndex( + [10, 20, 30], dtype="timedelta64[ns]" + ) tm.assert_equal( op(pd_td_idx1, pd_td_idx1), op(cudf_pandas_td_idx, cudf_pandas_td_idx) @@ -803,10 +827,14 @@ def test_timedelta_ops(op): @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_datetime_timedelta_ops(op): pd_dt_idx1 = pd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") - cudf_pandas_dt_idx = xpd.DatetimeIndex([10, 20, 30], dtype="datetime64[ns]") + cudf_pandas_dt_idx = xpd.DatetimeIndex( + [10, 20, 30], dtype="datetime64[ns]" + ) pd_td_idx1 = pd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") - cudf_pandas_td_idx = xpd.TimedeltaIndex([10, 20, 30], dtype="timedelta64[ns]") + cudf_pandas_td_idx = xpd.TimedeltaIndex( + [10, 20, 30], dtype="timedelta64[ns]" + ) tm.assert_equal( op(pd_dt_idx1, pd_td_idx1), op(cudf_pandas_dt_idx, cudf_pandas_td_idx) @@ -878,8 +906,12 @@ def test_datetime_values_dtype_roundtrip(): def test_resample(): - ser = pd.Series(range(3), index=pd.date_range("2020-01-01", freq="D", periods=3)) - xser = xpd.Series(range(3), index=xpd.date_range("2020-01-01", freq="D", periods=3)) + ser = pd.Series( + range(3), index=pd.date_range("2020-01-01", freq="D", periods=3) + ) + xser = xpd.Series( + range(3), index=xpd.date_range("2020-01-01", freq="D", periods=3) + ) expected = ser.resample("D").max() result = xser.resample("D").max() # TODO: See if as_unit can be avoided diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index 315effe83ba..a9a27c22225 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # noqa: E501 # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -116,7 +116,9 @@ def func(): assert _fast_arg(func)() == (fast_x, fast_y.method()) -def test_fast_slow_arg_function_global(monkeypatch, function_proxy, final_proxy): +def test_fast_slow_arg_function_global( + monkeypatch, function_proxy, final_proxy +): fast_x, slow_x, x = function_proxy fast_y, slow_y, y = final_proxy @@ -336,7 +338,9 @@ def test_doc(fast_and_intermediate_with_doc, slow_and_intermediate_with_doc): assert inspect.getdoc(Pxy().prop) == inspect.getdoc(Slow().prop) assert inspect.getdoc(Pxy.method) == inspect.getdoc(Slow.method) assert inspect.getdoc(Pxy().method) == inspect.getdoc(Slow().method) - assert inspect.getdoc(Pxy().intermediate()) == inspect.getdoc(Slow().intermediate()) + assert inspect.getdoc(Pxy().intermediate()) == inspect.getdoc( + Slow().intermediate() + ) assert inspect.getdoc(Pxy().intermediate().method) == inspect.getdoc( Slow().intermediate().method ) @@ -534,7 +538,9 @@ def test_tuple_with_attrs_transform(): assert a != b assert b != c assert a != d - transform = partial(_transform_arg, attribute_name="_fsproxy_fast", seen=set()) + transform = partial( + _transform_arg, attribute_name="_fsproxy_fast", seen=set() + ) aprime = transform(a) bprime = transform(b) cprime = transform(c) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index f0a4961b9ad..4921446ab6b 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -37,7 +37,11 @@ def test_profiler(): "Series.__getitem__", } for name, func in per_function_stats.items(): - assert len(func["cpu"]) == 0 if "Time" not in name else len(func["gpu"]) == 0 + assert ( + len(func["cpu"]) == 0 + if "Time" not in name + else len(func["gpu"]) == 0 + ) per_line_stats = profiler.per_line_stats calls = [ diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py index 8e5082234ec..5adab566da0 100644 --- a/python/cudf_kafka/cudf_kafka/_version.py +++ b/python/cudf_kafka/cudf_kafka/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,9 @@ import importlib.resources __version__ = ( - importlib.resources.files("cudf_kafka").joinpath("VERSION").read_text().strip() + importlib.resources.files("cudf_kafka") + .joinpath("VERSION") + .read_text() + .strip() ) __git_commit__ = "" diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py index 800582c753b..0f545f95f2b 100644 --- a/python/custreamz/custreamz/_version.py +++ b/python/custreamz/custreamz/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,6 +15,9 @@ import importlib.resources __version__ = ( - importlib.resources.files("custreamz").joinpath("VERSION").read_text().strip() + importlib.resources.files("custreamz") + .joinpath("VERSION") + .read_text() + .strip() ) __git_commit__ = "" diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index 5cf33e16c3b..0def0ba746e 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -129,7 +129,8 @@ def read_gdf( if topic is None: raise ValueError( - "ERROR: You must specify the topic " "that you want to consume from" + "ERROR: You must specify the topic " + "that you want to consume from" ) kafka_datasource = KafkaDatasource( diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 2abc416d2a4..1cda9b71387 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -14,7 +14,9 @@ def kafka_client(): s.shutdown(2) s.close() except Exception: - pytest.skip("A running Kafka instance must be available to run these tests") + pytest.skip( + "A running Kafka instance must be available to run these tests" + ) kafka_configs = { "metadata.broker.list": "localhost:9092", diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index b24cf691988..bae4b051cae 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -277,7 +277,9 @@ def test_getitem(stream): "indexer", [lambda g: g, lambda g: g[["y"]], lambda g: g[["x", "y"]]] ) def test_groupby_aggregate(agg, grouper, indexer, stream): - df = cudf.DataFrame({"x": (np.arange(10) // 2).astype(float), "y": [1.0, 2.0] * 5}) + df = cudf.DataFrame( + {"x": (np.arange(10) // 2).astype(float), "y": [1.0, 2.0] * 5} + ) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -300,7 +302,9 @@ def f(x): def test_repr(stream): - df = cudf.DataFrame({"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10}) + df = cudf.DataFrame( + {"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10} + ) a = DataFrame(example=df, stream=stream) text = repr(a) @@ -317,7 +321,9 @@ def test_repr(stream): def test_repr_html(stream): - df = cudf.DataFrame({"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10}) + df = cudf.DataFrame( + {"x": (np.arange(10) // 2).astype(float), "y": [1.0] * 10} + ) a = DataFrame(example=df, stream=stream) for x in [a, a.y, a.y.mean()]: @@ -414,8 +420,12 @@ def test_setitem_overwrites(stream): (lambda df: df, lambda df: df.x), ], ) -def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs, stream): - index = pd.DatetimeIndex(pd.date_range("2000-01-01", "2000-01-03", freq="1h")) +def test_rolling_count_aggregations( + op, window, m, pre_get, post_get, kwargs, stream +): + index = pd.DatetimeIndex( + pd.date_range("2000-01-01", "2000-01-03", freq="1h") + ) df = cudf.DataFrame({"x": np.arange(len(index))}, index=index) expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs) @@ -610,12 +620,16 @@ def test_windowing_n(func, n, getter): @pytest.mark.parametrize("func", [lambda x: x.sum(), lambda x: x.mean()]) @pytest.mark.parametrize("value", ["10h", "1d"]) @pytest.mark.parametrize("getter", [lambda df: df, lambda df: 
df.x]) -@pytest.mark.parametrize("grouper", [lambda a: "y", lambda a: a.index, lambda a: ["y"]]) +@pytest.mark.parametrize( + "grouper", [lambda a: "y", lambda a: a.index, lambda a: ["y"]] +) @pytest.mark.parametrize( "indexer", [lambda g: g, lambda g: g[["x"]], lambda g: g[["x", "y"]]] ) def test_groupby_windowing_value(func, value, getter, grouper, indexer): - index = pd.DatetimeIndex(pd.date_range("2000-01-01", "2000-01-03", freq="1h")) + index = pd.DatetimeIndex( + pd.date_range("2000-01-01", "2000-01-03", freq="1h") + ) df = cudf.DataFrame( { "x": np.arange(len(index), dtype=float), @@ -739,7 +753,9 @@ def test_groupby_aggregate_with_start_state(stream): sdf = DataFrame(stream, example=example).groupby(["name"]) output0 = sdf.amount.sum(start=None).stream.gather().sink_to_list() output1 = ( - sdf.amount.mean(with_state=True, start=None).stream.gather().sink_to_list() + sdf.amount.mean(with_state=True, start=None) + .stream.gather() + .sink_to_list() ) output2 = sdf.amount.count(start=None).stream.gather().sink_to_list() @@ -747,7 +763,9 @@ def test_groupby_aggregate_with_start_state(stream): stream.emit(df) out_df0 = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [50, 100]}) - out_df1 = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [50.0, 100.0]}) + out_df1 = cudf.DataFrame( + {"name": ["Alice", "Tom"], "amount": [50.0, 100.0]} + ) out_df2 = cudf.DataFrame({"name": ["Alice", "Tom"], "amount": [1, 1]}) assert assert_eq(output0[0].reset_index(), out_df0) assert assert_eq(output1[0][1].reset_index(), out_df1) @@ -762,7 +780,9 @@ def test_groupby_aggregate_with_start_state(stream): .sink_to_list() ) output5 = sdf.amount.count(start=output2[0]).stream.gather().sink_to_list() - df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) + df = cudf.DataFrame( + {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} + ) stream.emit(df) out_df2 = cudf.DataFrame( @@ -771,7 +791,9 @@ def test_groupby_aggregate_with_start_state(stream): out_df3 = cudf.DataFrame( {"name": ["Alice", "Linda", "Tom"], "amount": [50.0, 200.0, 100.0]} ) - out_df4 = cudf.DataFrame({"name": ["Alice", "Linda", "Tom"], "amount": [2, 1, 2]}) + out_df4 = cudf.DataFrame( + {"name": ["Alice", "Linda", "Tom"], "amount": [2, 1, 2]} + ) assert assert_eq(output3[0].reset_index(), out_df2) assert assert_eq(output4[0][1].reset_index(), out_df3) assert assert_eq(output5[0].reset_index(), out_df4) @@ -784,7 +806,9 @@ def test_reductions_with_start_state(stream): output1 = sdf.amount.count(start=3).stream.gather().sink_to_list() output2 = sdf.amount.sum(start=10).stream.gather().sink_to_list() - df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) + df = cudf.DataFrame( + {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} + ) stream.emit(df) assert output0[0] == 72.0 @@ -802,7 +826,9 @@ def test_rolling_aggs_with_start_state(stream): .sink_to_list() ) - df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) + df = cudf.DataFrame( + {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} + ) stream.emit(df) df = cudf.DataFrame({"name": ["Bob"], "amount": [250]}) stream.emit(df) @@ -846,7 +872,9 @@ def test_window_aggs_with_start_state(stream): .sink_to_list() ) - df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) + df = cudf.DataFrame( + {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} + ) stream.emit(df) df = cudf.DataFrame({"name": ["Bob"], "amount": [250]}) stream.emit(df) 
@@ -877,9 +905,13 @@ def test_windowed_groupby_aggs_with_start_state(stream): .sink_to_list() ) - df = cudf.DataFrame({"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]}) + df = cudf.DataFrame( + {"name": ["Alice", "Tom", "Linda"], "amount": [50, 100, 200]} + ) stream.emit(df) - df = cudf.DataFrame({"name": ["Alice", "Linda", "Bob"], "amount": [250, 300, 350]}) + df = cudf.DataFrame( + {"name": ["Alice", "Linda", "Bob"], "amount": [250, 300, 350]} + ) stream.emit(df) stream = Stream() diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py index 4a3f72077bc..0dd62854a4e 100644 --- a/python/dask_cudf/dask_cudf/_version.py +++ b/python/dask_cudf/dask_cudf/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,9 @@ import importlib.resources __version__ = ( - importlib.resources.files("dask_cudf").joinpath("VERSION").read_text().strip() + importlib.resources.files("dask_cudf") + .joinpath("VERSION") + .read_text() + .strip() ) __git_commit__ = "" diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index d9466313528..c7b4a1c4c6a 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -80,7 +80,9 @@ def _nonempty_index(idx): elif isinstance(idx._column, cudf.core.column.StringColumn): return cudf.Index(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.Index): - return cudf.core.index.Index(np.arange(2, dtype=idx.dtype), name=idx.name) + return cudf.core.index.Index( + np.arange(2, dtype=idx.dtype), name=idx.name + ) raise TypeError(f"Don't know how to handle index of type {type(idx)}") @@ -100,7 +102,9 @@ def _nest_list_data(data, leaf_type): @_dask_cudf_nvtx_annotate def _get_non_empty_data(s): if isinstance(s, cudf.core.column.CategoricalColumn): - categories = s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] + categories = ( + s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] + ) codes = cudf.core.column.as_column( 0, dtype=cudf._lib.types.size_type_dtype, @@ -131,7 +135,9 @@ def _get_non_empty_data(s): data = data.tz_localize(str(s.dtype.tz))._column else: if pd.api.types.is_numeric_dtype(s.dtype): - data = cudf.core.column.as_column(cp.arange(start=0, stop=2, dtype=s.dtype)) + data = cudf.core.column.as_column( + cp.arange(start=0, stop=2, dtype=s.dtype) + ) else: data = cudf.core.column.as_column( cp.arange(start=0, stop=2, dtype="int64") @@ -241,7 +247,9 @@ def make_meta_object_cudf(x, index=None): return _empty_series(x[0], x[1], index=index) elif isinstance(x, (list, tuple)): if not all(isinstance(i, tuple) and len(i) == 2 for i in x): - raise ValueError(f"Expected iterable of tuples of (name, dtype), got {x}") + raise ValueError( + f"Expected iterable of tuples of (name, dtype), got {x}" + ) return cudf.DataFrame( {c: _empty_series(c, d, index=index) for (c, d) in x}, columns=[c for c, d in x], @@ -287,7 +295,9 @@ def concat_cudf( return cudf.concat(dfs, axis=axis, ignore_index=ignore_index) -@categorical_dtype_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) +@categorical_dtype_dispatch.register( + (cudf.DataFrame, cudf.Series, cudf.BaseIndex) +) @_dask_cudf_nvtx_annotate def categorical_dtype_cudf(categories=None, ordered=False): return cudf.CategoricalDtype(categories=categories, 
ordered=ordered) @@ -329,11 +339,15 @@ def percentile_cudf(a, q, interpolation="linear"): result = cp.percentile(a.cat.codes, q, interpolation=interpolation) return ( - pd.Categorical.from_codes(result, a.dtype.categories, a.dtype.ordered), + pd.Categorical.from_codes( + result, a.dtype.categories, a.dtype.ordered + ), n, ) if np.issubdtype(a.dtype, np.datetime64): - result = a.quantile([i / 100.0 for i in q], interpolation=interpolation) + result = a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ) if q[0] == 0: # https://github.com/dask/dask/issues/6864 @@ -342,7 +356,9 @@ def percentile_cudf(a, q, interpolation="linear"): if not np.issubdtype(a.dtype, np.number): interpolation = "nearest" return ( - a.quantile([i / 100.0 for i in q], interpolation=interpolation).to_pandas(), + a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ).to_pandas(), n, ) @@ -355,7 +371,9 @@ def _get_pyarrow_schema_cudf(obj, preserve_index=None, **kwargs): f"`pyarrow_schema_dispatch`: {list(kwargs)}" ) - return _cudf_to_table(meta_nonempty(obj), preserve_index=preserve_index).schema + return _cudf_to_table( + meta_nonempty(obj), preserve_index=preserve_index + ).schema @to_pyarrow_table_dispatch.register(cudf.DataFrame) @@ -371,7 +389,9 @@ def _cudf_to_table(obj, preserve_index=None, **kwargs): if preserve_index and isinstance(obj.index, cudf.RangeIndex): obj = obj.copy() obj.index.name = ( - obj.index.name if obj.index.name is not None else "__index_level_0__" + obj.index.name + if obj.index.name is not None + else "__index_level_0__" ) obj.index = obj.index._as_int_index() @@ -400,7 +420,9 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) @_dask_cudf_nvtx_annotate -def union_categoricals_cudf(to_union, sort_categories=False, ignore_order=False): +def union_categoricals_cudf( + to_union, sort_categories=False, ignore_order=False +): return cudf.api.types._union_categoricals( to_union, sort_categories=False, ignore_order=False ) @@ -443,7 +465,8 @@ def group_split_cudf(df, c, k, ignore_index=False): @_dask_cudf_nvtx_annotate def sizeof_cudf_dataframe(df): return int( - sum(col.memory_usage for col in df._data.columns) + df._index.memory_usage() + sum(col.memory_usage for col in df._data.columns) + + df._index.memory_usage() ) @@ -609,7 +632,9 @@ def read_hdf(*args, **kwargs): "read_hdf is not yet implemented in cudf/dask_cudf. " "Moving to cudf from pandas. Expect poor performance!" 
) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend("cudf") + return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( + "cudf" + ) # Define "cudf" backend entrypoint for dask-expr diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index c22c889be92..bfe58531a73 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -102,7 +102,9 @@ def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): cache_key = uuid.uuid4() def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows(func, incols, outcols, kwargs, cache_key=cache_key) + return df.apply_rows( + func, incols, outcols, kwargs, cache_key=cache_key + ) meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) return self.map_partitions( @@ -424,7 +426,9 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) n = num.count(split_every=split_every) name = ddf._token_prefix + "var" - result = map_partitions(var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof) + result = map_partitions( + var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof + ) if isinstance(ddf, DataFrame): result.divisions = (min(ddf.columns), max(ddf.columns)) return handle_out(out, result) @@ -467,7 +471,8 @@ def _finalize_var(vals): local_name = "local-" + name num = ddf._get_numeric_data() dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) for n in range(nparts) + (local_name, n, 0): (_local_var, (num._name, n), skipna) + for n in range(nparts) } # Use reduction tree @@ -481,7 +486,9 @@ def _finalize_var(vals): p_max = widths[depth - 1] lstart = split_every * group lstop = min(lstart + split_every, p_max) - node_list = [(local_name, p, depth - 1) for p in range(lstart, lstop)] + node_list = [ + (local_name, p, depth - 1) for p in range(lstart, lstop) + ] dsk[(local_name, group, depth)] = (_aggregate_var, node_list) if height == 1: group = depth = 0 @@ -628,7 +635,10 @@ def reduction( # Chunk a = f"{token or funcname(chunk)}-chunk-{token_key}" if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = {(a, 0, i): (chunk, key) for i, key in enumerate(args[0].__dask_keys__())} + dsk = { + (a, 0, i): (chunk, key) + for i, key in enumerate(args[0].__dask_keys__()) + } else: dsk = { (a, 0, i): ( @@ -678,13 +688,16 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from dask_cudf import QUERY_PLANNING_ON if isinstance(getattr(data, "index", None), cudf.MultiIndex): - raise NotImplementedError("dask_cudf does not support MultiIndex Dataframes.") + raise NotImplementedError( + "dask_cudf does not support MultiIndex Dataframes." 
+ ) # Dask-expr doesn't support the `name` argument name = {} if not QUERY_PLANNING_ON: name = { - "name": name or ("from_cudf-" + tokenize(data, npartitions or chunksize)) + "name": name + or ("from_cudf-" + tokenize(data, npartitions or chunksize)) } return dd.from_pandas( diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 4c7145ddaf1..b2f92aeddda 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -43,7 +43,9 @@ def var( index = self._meta.to_pandas().var(numeric_only=True).index frame = frame[list(index)] return new_collection( - frame.expr.var(axis, skipna, ddof, numeric_only, split_every=split_every) + frame.expr.var( + axis, skipna, ddof, numeric_only, split_every=split_every + ) ) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 0b36218ed5a..43ad4f0fee3 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -193,7 +193,9 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate - def aggregate(self, arg, split_every=None, split_out=1, shuffle_method=None): + def aggregate( + self, arg, split_every=None, split_out=1, shuffle_method=None + ): if arg == "size": return self.size() @@ -335,7 +337,9 @@ def last(self, split_every=None, split_out=1): @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate - def aggregate(self, arg, split_every=None, split_out=1, shuffle_method=None): + def aggregate( + self, arg, split_every=None, split_out=1, shuffle_method=None + ): if arg == "size": return self.size() @@ -597,7 +601,9 @@ def groupby_agg( split_out, token="cudf-aggregate", sort=sort, - shuffle_method=shuffle_method if isinstance(shuffle_method, str) else None, + shuffle_method=shuffle_method + if isinstance(shuffle_method, str) + else None, ) # Deal with sort/shuffle defaults @@ -610,7 +616,9 @@ def groupby_agg( ) # Determine required columns to enable column projection - required_columns = list(set(gb_cols).union(aggs.keys()).intersection(ddf.columns)) + required_columns = list( + set(gb_cols).union(aggs.keys()).intersection(ddf.columns) + ) return aca( [ddf[required_columns]], @@ -631,7 +639,9 @@ def groupby_agg( @_dask_cudf_nvtx_annotate -def _make_groupby_agg_call(gb, aggs, split_every, split_out, shuffle_method=None): +def _make_groupby_agg_call( + gb, aggs, split_every, split_out, shuffle_method=None +): """Helper method to consolidate the common `groupby_agg` call for all aggregations in one place """ @@ -666,7 +676,9 @@ def _redirect_aggs(arg): if isinstance(arg[col], list): new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] elif isinstance(arg[col], dict): - new_arg[col] = {k: redirects.get(v, v) for k, v in arg[col].items()} + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } else: new_arg[col] = redirects.get(arg[col], arg[col]) return new_arg @@ -744,7 +756,9 @@ def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): df[pow2_name] = df[col].astype("float64").pow(2) _agg_dict[pow2_name] = ["sum"] - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(_agg_dict) + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + _agg_dict + ) output_columns = [_make_name(name, sep=sep) for name in gb.columns] gb.columns = output_columns # Return with deterministic column ordering @@ -774,7 +788,9 @@ def _tree_node_agg(df, gb_cols, dropna, sort, 
sep): else: raise ValueError(f"Unexpected aggregation: {agg}") - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(agg_dict) + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + agg_dict + ) # Don't include the last aggregation in the column names output_columns = [ diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 2ebaa875817..49fea0d7602 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from io import BufferedWriter, IOBase @@ -18,7 +18,9 @@ def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): if kwargs is None: kwargs = {} with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc(f, stripes=[stripe], columns=columns, **kwargs) + df_stripe = cudf.read_orc( + f, stripes=[stripe], columns=columns, **kwargs + ) return df_stripe @@ -76,13 +78,17 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): if schema is None: schema = o.schema elif schema != o.schema: - raise ValueError("Incompatible schemas while parsing ORC files") + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) nstripes_per_file.append(o.nstripes) schema = _get_pyarrow_dtypes(schema, categories=None) if columns is not None: ex = set(columns) - set(schema) if ex: - raise ValueError(f"Requested columns ({ex}) not in schema ({set(schema)})") + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) else: columns = list(schema) @@ -99,7 +105,9 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): N = 0 for path, n in zip(paths, nstripes_per_file): for stripe in ( - range(n) if filters is None else cudf.io.orc._filter_stripes(filters, path) + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) ): dsk[(name, N)] = ( _read_orc_stripe, @@ -161,7 +169,9 @@ def to_orc( if hasattr(path, "name"): path = stringify_path(path) - fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) # Trim any protocol information from the path before forwarding path = fs._strip_protocol(path) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index bd716659ab8..fc962670c47 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -126,7 +126,9 @@ def _read_paths( pof, engine="cudf", columns=columns, - row_groups=row_groups[i] if row_groups else None, + row_groups=row_groups[i] + if row_groups + else None, dataset_kwargs=dataset_kwargs, categorical_partitions=False, **kwargs, @@ -282,7 +284,8 @@ def read_partition( paths.append(path) rgs.append( [row_group] - if not isinstance(row_group, list) and row_group is not None + if not isinstance(row_group, list) + and row_group is not None else row_group ) last_partition_keys = partition_keys @@ -369,14 +372,18 @@ def write_partition( engine=kwargs.get("engine", "cudf"), index=kwargs.get("index", None), partition_cols=kwargs.get("partition_cols", None), - partition_file_name=kwargs.get("partition_file_name", None), + partition_file_name=kwargs.get( + "partition_file_name", None + ), partition_offsets=kwargs.get("partition_offsets", None), statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", 
False), row_group_size_bytes=kwargs.get( "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT ), - row_group_size_rows=kwargs.get("row_group_size_rows", None), + row_group_size_rows=kwargs.get( + "row_group_size_rows", None + ), storage_options=kwargs.get("storage_options", None), metadata_file_path=filename if return_metadata else None, ) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 89cf7c82001..a35a9f1be48 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -86,9 +86,9 @@ def test_csv_roundtrip_filepath(tmp_path): def test_read_csv(tmp_path): - df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( - drop=True - ) + df = dask.datasets.timeseries( + dtypes={"x": int, "y": int}, freq="120s" + ).reset_index(drop=True) csv_path = str(tmp_path / "data-*.csv") df.to_csv(csv_path, index=False) @@ -115,9 +115,9 @@ def test_raises_FileNotFoundError(): def test_read_csv_w_bytes(tmp_path): - df = dask.datasets.timeseries(dtypes={"x": int, "y": int}, freq="120s").reset_index( - drop=True - ) + df = dask.datasets.timeseries( + dtypes={"x": int, "y": int}, freq="120s" + ).reset_index(drop=True) df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20))) df.to_csv(tmp_path / "data-*.csv", index=False) @@ -169,7 +169,11 @@ def test_read_csv_compression_file_list(tmp_path): def test_read_csv_blocksize_none(tmp_path, compression, size): df = pd.DataFrame(dict(x=np.arange(size), y=np.arange(size))) - path = tmp_path / "data.csv.gz" if compression == "gzip" else tmp_path / "data.csv" + path = ( + tmp_path / "data.csv.gz" + if compression == "gzip" + else tmp_path / "data.csv" + ) # Types need to be specified for empty csv files if size == 0: @@ -257,4 +261,6 @@ def test_read_csv_nrows(csv_end_bad_lines): def test_read_csv_nrows_error(csv_end_bad_lines): with pytest.raises(ValueError): - dask_cudf.read_csv(csv_end_bad_lines, nrows=2, blocksize="100 MiB").compute() + dask_cudf.read_csv( + csv_end_bad_lines, nrows=2, blocksize="100 MiB" + ).compute() diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 18a06a7dda6..de2a735b2ce 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -50,7 +50,9 @@ def test_roundtrip_backend_dispatch(tmpdir): @pytest.mark.parametrize("divisions", [True, False]) def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): tmpdir = str(tmpdir) - ddf.to_parquet(tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow") + ddf.to_parquet( + tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow" + ) files = sorted( (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)), key=natural_sort_key, @@ -61,11 +63,15 @@ def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] - ddf2 = dask_cudf.read_parquet(files, columns=["x"], calculate_divisions=divisions) + ddf2 = dask_cudf.read_parquet( + files, columns=["x"], calculate_divisions=divisions + ) dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' - ddf2 = dask_cudf.read_parquet(files, columns="y", calculate_divisions=divisions) + ddf2 = dask_cudf.read_parquet( + files, columns="y", calculate_divisions=divisions + ) dd.assert_eq(ddf["y"], ddf2, check_divisions=divisions) # Now include metadata @@ 
-73,11 +79,15 @@ def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] (with metadata) - ddf2 = dask_cudf.read_parquet(tmpdir, columns=["x"], calculate_divisions=divisions) + ddf2 = dask_cudf.read_parquet( + tmpdir, columns=["x"], calculate_divisions=divisions + ) dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' (with metadata) - ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", calculate_divisions=divisions) + ddf2 = dask_cudf.read_parquet( + tmpdir, columns="y", calculate_divisions=divisions + ) dd.assert_eq(ddf["y"], ddf2, check_divisions=divisions) @@ -140,7 +150,9 @@ def test_roundtrip_from_pandas(tmpdir): def test_strings(tmpdir): fn = str(tmpdir) - dfp = pd.DataFrame({"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]}) + dfp = pd.DataFrame( + {"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]} + ) dfp.set_index("a", inplace=True, drop=True) ddf2 = dd.from_pandas(dfp, npartitions=2) ddf2.to_parquet(fn, engine="pyarrow") @@ -165,7 +177,9 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, calculate_divisions=divisions) + read_df = dask_cudf.read_parquet( + fn, index=index, calculate_divisions=divisions + ) dd.assert_eq( ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -176,10 +190,14 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): fn = str(tmpdir) - ddf2 = dask_cudf.from_cudf(cudf.datasets.timeseries(freq="D"), npartitions=4) + ddf2 = dask_cudf.from_cudf( + cudf.datasets.timeseries(freq="D"), npartitions=4 + ) ddf2.name = ddf2.name.astype("object") ddf2.to_parquet(fn, write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, calculate_divisions=divisions) + read_df = dask_cudf.read_parquet( + fn, index=index, calculate_divisions=divisions + ) dd.assert_eq( ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -206,7 +224,9 @@ def test_filters(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow") - a = dask_cudf.read_parquet(tmp_path, filters=[("x", ">", 4)], split_row_groups=True) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", ">", 4)], split_row_groups=True + ) assert a.npartitions == 3 assert (a.x > 3).all().compute() @@ -245,13 +265,17 @@ def test_isna_filters(tmpdir, null, numeric): # Test "is" col = "i" if numeric else "j" filters = [(col, "is", null)] - out = dask_cudf.read_parquet(tmp_path, filters=filters, split_row_groups=True) + out = dask_cudf.read_parquet( + tmp_path, filters=filters, split_row_groups=True + ) assert len(out) == 2 assert list(out.x.compute().values) == [4, 5] # Test "is not" filters = [(col, "is not", null)] - out = dask_cudf.read_parquet(tmp_path, filters=filters, split_row_groups=True) + out = dask_cudf.read_parquet( + tmp_path, filters=filters, split_row_groups=True + ) assert len(out) == 8 assert list(out.x.compute().values) == [0, 1, 2, 3, 6, 7, 8, 9] @@ -272,7 +296,9 @@ def test_filters_at_row_group_level(tmpdir): # Overwrite=True can be removed for dask-expr>=0.4.1 # See: https://github.com/dask-contrib/dask-expr/issues/800 - ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1, overwrite=True) + ddf.to_parquet( + 
tmp_path, engine="pyarrow", row_group_size=1, overwrite=True + ) b = dask_cudf.read_parquet( tmp_path, filters=[("x", "==", 1)], split_row_groups=True @@ -297,7 +323,9 @@ def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata): df.index.name = "index" if daskcudf: ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2) - ddf2.to_parquet(tmpdir, write_metadata_file=metadata, partition_on=parts) + ddf2.to_parquet( + tmpdir, write_metadata_file=metadata, partition_on=parts + ) else: ddf2 = dd.from_pandas(df, npartitions=2) ddf2.to_parquet( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 9d7f1559680..f4a6fabdb60 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -129,7 +129,9 @@ def test_read_parquet(s3_base, s3so, open_file_options): buffer = BytesIO() pdf.to_parquet(path=buffer) buffer.seek(0) - with s3_context(s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer}): + with s3_context( + s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer} + ): if "open_file_func" in open_file_options: fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 9aa98b859c5..f3774e20d32 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -31,12 +31,16 @@ def wrapper(*args, **kwargs): if old_arg_value is not None: new_arg_value = old_arg_value msg = ( - "the 'shuffle' keyword is deprecated, " "use 'shuffle_method' instead." + "the 'shuffle' keyword is deprecated, " + "use 'shuffle_method' instead." ) warnings.warn(msg, FutureWarning) if kwargs.get("shuffle_method") is not None: - msg = "Can only specify 'shuffle' " "or 'shuffle_method', not both." + msg = ( + "Can only specify 'shuffle' " + "or 'shuffle_method', not both." 
+ ) raise TypeError(msg) kwargs["shuffle_method"] = new_arg_value return func(*args, **kwargs) @@ -56,7 +60,9 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 else: - partitions = len(divisions) - divisions.searchsorted(s, side="right") - 1 + partitions = ( + len(divisions) - divisions.searchsorted(s, side="right") - 1 + ) partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( 0 if ascending else (len(divisions) - 2) ) @@ -192,7 +198,8 @@ def finalize_tsk(tsk): name = "quantiles-1-" + token val_dsk = { - (name, i): (_quantile, key, qs) for i, key in enumerate(df.__dask_keys__()) + (name, i): (_quantile, key, qs) + for i, key in enumerate(df.__dask_keys__()) } name2 = "quantiles-2-" + token diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 58553d14f0b..ebb8e4be187 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -127,7 +127,9 @@ def test_categorical_basic(data): assert_eq(pdsr.cat.categories, dsr.cat.categories) - np.testing.assert_array_equal(pdsr.cat.codes.values, result.cat.codes.values_host) + np.testing.assert_array_equal( + pdsr.cat.codes.values, result.cat.codes.values_host + ) string = str(result) expect_str = """ @@ -228,8 +230,12 @@ def test_categorical_compare_ordered(data): assert pdsr1.cat.ordered # Test ordered operators - np.testing.assert_array_equal(pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host) - np.testing.assert_array_equal(pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host) + np.testing.assert_array_equal( + pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host + ) + np.testing.assert_array_equal( + pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host + ) ############################################################################# @@ -252,7 +258,9 @@ def test_string_slicing(data): def test_categorical_categories(): - df = DataFrame({"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)}) + df = DataFrame( + {"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)} + ) df["a"] = df["a"].astype("category") pdf = df.to_pandas(nullable=False) @@ -312,7 +320,10 @@ def data_test_non_numeric(): def data_test_nested(): - return [list(list(y for y in range(x % 5)) for x in range(i)) for i in range(40)] + return [ + list(list(y for y in range(x % 5)) for x in range(i)) + for i in range(40) + ] def data_test_sort(): @@ -523,7 +534,9 @@ def test_struct_explode(data): def test_tz_localize(): data = Series(date_range("2000-04-01", "2000-04-03", freq="H")) - expect = data.dt.tz_localize("US/Eastern", ambiguous="NaT", nonexistent="NaT") + expect = data.dt.tz_localize( + "US/Eastern", ambiguous="NaT", nonexistent="NaT" + ) got = dask_cudf.from_cudf(data, 2).dt.tz_localize( "US/Eastern", ambiguous="NaT", nonexistent="NaT" ) @@ -538,7 +551,9 @@ def test_tz_localize(): "data", [ date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("UTC"), - date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("US/Eastern"), + date_range("2000-04-01", "2000-04-03", freq="H").tz_localize( + "US/Eastern" + ), ], ) def test_tz_convert(data): diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 13bd444495c..8a2f3414fd1 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -195,7 +195,9 @@ def 
test_set_index(nelem): # Use unique index range as the sort may not be stable-ordering x = np.arange(nelem) np.random.shuffle(x) - df = pd.DataFrame({"x": x, "y": np.random.randint(0, nelem, size=nelem)}) + df = pd.DataFrame( + {"x": x, "y": np.random.randint(0, nelem, size=nelem)} + ) ddf = dd.from_pandas(df, npartitions=2) ddf2 = ddf.to_backend("cudf") @@ -307,7 +309,9 @@ def test_rearrange_by_divisions(nelem, index): df["z"] = df["z"].astype("category") ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf(cudf.DataFrame.from_pandas(df), npartitions=4) + gdf1 = dask_cudf.from_cudf( + cudf.DataFrame.from_pandas(df), npartitions=4 + ) ddf1.index.name = index gdf1.index.name = index divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) @@ -488,7 +492,10 @@ def test_repartition_hash_staged(npartitions): # and that the key values are preserved expect_unique = gdf[by].drop_duplicates().sort_values(by) got_unique = cudf.concat( - [part[by].compute().drop_duplicates() for part in ddf_new[by].partitions], + [ + part[by].compute().drop_duplicates() + for part in ddf_new[by].partitions + ], ignore_index=True, ).sort_values(by) dd.assert_eq(got_unique, expect_unique, check_index=False) @@ -525,7 +532,10 @@ def test_repartition_hash(by, npartitions, max_branch): # and that the key values are preserved expect_unique = gdf[by].drop_duplicates().sort_values(by) got_unique = cudf.concat( - [part[by].compute().drop_duplicates() for part in ddf_new[by].partitions], + [ + part[by].compute().drop_duplicates() + for part in ddf_new[by].partitions + ], ignore_index=True, ).sort_values(by) dd.assert_eq(got_unique, expect_unique, check_index=False) @@ -584,7 +594,11 @@ def test_concat(gdf, gddf, series): if series: gdf = gdf.x gddf = gddf.x - a = cudf.concat([gdf, gdf + 1, gdf + 2]).sort_values().reset_index(drop=True) + a = ( + cudf.concat([gdf, gdf + 1, gdf + 2]) + .sort_values() + .reset_index(drop=True) + ) b = ( dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True) .compute() @@ -592,7 +606,11 @@ def test_concat(gdf, gddf, series): .reset_index(drop=True) ) else: - a = cudf.concat([gdf, gdf + 1, gdf + 2]).sort_values("x").reset_index(drop=True) + a = ( + cudf.concat([gdf, gdf + 1, gdf + 2]) + .sort_values("x") + .reset_index(drop=True) + ) b = ( dd.concat([gddf, gddf + 1, gddf + 2], interleave_partitions=True) .compute() @@ -671,7 +689,9 @@ def test_hash_object_dispatch(index): ) def test_make_meta_backends(index): dtypes = ["int8", "int32", "int64", "float64"] - df = cudf.DataFrame({dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}) + df = cudf.DataFrame( + {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes} + ) df["strings"] = ["cat", "dog", "fish"] df["cats"] = df["strings"].astype("category") df["time_s"] = np.array( @@ -781,7 +801,9 @@ def test_dataframe_describe(): ddf = dask_cudf.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) - dd.assert_eq(ddf.describe(), pddf.describe(), check_exact=False, atol=0.0001) + dd.assert_eq( + ddf.describe(), pddf.describe(), check_exact=False, atol=0.0001 + ) @xfail_dask_expr("Insufficient describe support in dask-expr") @@ -828,13 +850,17 @@ def test_index_map_partitions(): def test_merging_categorical_columns(): - df_1 = cudf.DataFrame({"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]}) + df_1 = cudf.DataFrame( + {"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]} + ) ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) - df_2 = 
cudf.DataFrame({"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]}) + df_2 = cudf.DataFrame( + {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} + ) ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9757f5966a4..3bb3e3b0bb8 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -62,7 +62,9 @@ def pdf(request): def test_groupby_basic(series, aggregation, pdf): gdf = cudf.DataFrame.from_pandas(pdf) gdf_grouped = gdf.groupby("xx", dropna=True) - ddf_grouped = dask_cudf.from_cudf(gdf, npartitions=5).groupby("xx", dropna=True) + ddf_grouped = dask_cudf.from_cudf(gdf, npartitions=5).groupby( + "xx", dropna=True + ) if series: gdf_grouped = gdf_grouped.x @@ -221,7 +223,9 @@ def test_reset_index_multiindex(): @pytest.mark.parametrize("split_out", [1, 2, 3]) -@pytest.mark.parametrize("column", ["c", "d", "e", ["b", "c"], ["b", "d"], ["b", "e"]]) +@pytest.mark.parametrize( + "column", ["c", "d", "e", ["b", "c"], ["b", "d"], ["b", "e"]] +) def test_groupby_split_out(split_out, column): df = pd.DataFrame( { @@ -239,17 +243,26 @@ def test_groupby_split_out(split_out, column): gddf = dask_cudf.from_cudf(gdf, npartitions=3) ddf_result = ( - ddf.groupby(column).a.mean(split_out=split_out).compute().sort_values().dropna() + ddf.groupby(column) + .a.mean(split_out=split_out) + .compute() + .sort_values() + .dropna() ) gddf_result = ( - gddf.groupby(column).a.mean(split_out=split_out).compute().sort_values() + gddf.groupby(column) + .a.mean(split_out=split_out) + .compute() + .sort_values() ) dd.assert_eq(gddf_result, ddf_result, check_index=False) @pytest.mark.parametrize("dropna", [False, True, None]) -@pytest.mark.parametrize("by", ["a", "b", "c", "d", ["a", "b"], ["a", "c"], ["a", "d"]]) +@pytest.mark.parametrize( + "by", ["a", "b", "c", "d", ["a", "b"], ["a", "c"], ["a", "d"]] +) def test_groupby_dropna_cudf(dropna, by): # NOTE: This test is borrowed from upstream dask # (dask/dask/dataframe/tests/test_groupby.py) @@ -296,12 +309,16 @@ def test_groupby_dropna_cudf(dropna, by): pytest.param( False, ["a", "b"], - marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/8817"), + marks=pytest.mark.xfail( + reason="https://github.com/dask/dask/issues/8817" + ), ), pytest.param( False, ["a", "c"], - marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/8817"), + marks=pytest.mark.xfail( + reason="https://github.com/dask/dask/issues/8817" + ), ), pytest.param( False, @@ -502,7 +519,9 @@ def test_groupby_reset_index_dtype(): def test_groupby_reset_index_names(): - df = cudf.datasets.randomdata(nrows=10, dtypes={"a": str, "b": int, "c": int}) + df = cudf.datasets.randomdata( + nrows=10, dtypes={"a": str, "b": int, "c": int} + ) pdf = df.to_pandas() gddf = dask_cudf.from_cudf(df, 2) @@ -524,11 +543,17 @@ def test_groupby_reset_index_string_name(): gddf = dask_cudf.from_cudf(df, npartitions=1) pddf = dd.from_pandas(pdf, npartitions=1) - g_res = gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) - p_res = pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) + g_res = ( + gddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) + ) + p_res = ( + pddf.groupby(["key"]).agg({"value": "mean"}).reset_index(drop=False) + ) got = g_res.compute().sort_values(["key", "value"]).reset_index(drop=True) - expect = 
p_res.compute().sort_values(["key", "value"]).reset_index(drop=True) + expect = ( + p_res.compute().sort_values(["key", "value"]).reset_index(drop=True) + ) dd.assert_eq(got, expect) assert len(g_res) == len(p_res) @@ -621,7 +646,9 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): assert ("name", "") in gr.columns and ("a", "") in gr.columns # Check `split_out` argument - assert gr.npartitions == (1 if split_out == "use_dask_default" else split_out) + assert gr.npartitions == ( + 1 if split_out == "use_dask_default" else split_out + ) # Compute for easier multiindex handling gf = gr.compute() @@ -632,13 +659,19 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): gf = gf.reset_index(drop=False) sort_cols = [("name", ""), ("a", ""), ("c", "mean")] gf = gf.sort_values(sort_cols).reset_index(drop=True) - pf = pf.reset_index(drop=False).sort_values(sort_cols).reset_index(drop=True) + pf = ( + pf.reset_index(drop=False) + .sort_values(sort_cols) + .reset_index(drop=True) + ) dd.assert_eq(gf, pf) @xfail_dask_expr("Newer dask-expr version needed") -@pytest.mark.parametrize("aggregations", [(sum, "sum"), (max, "max"), (min, "min")]) +@pytest.mark.parametrize( + "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] +) def test_groupby_agg_redirect(aggregations): pdf = pd.DataFrame( { @@ -731,7 +764,9 @@ def test_groupby_with_list_of_series(): ddf = dd.from_pandas(df.to_pandas(), npartitions=2) pgs = dd.from_pandas(gs.to_pandas(), npartitions=2) - dd.assert_eq(gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"])) + dd.assert_eq( + gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"]) + ) @xfail_dask_expr("Nested renamer not supported in dask-expr") @@ -770,7 +805,9 @@ def test_groupby_nested_dict(func): lambda df: df.groupby(["x", "y"]).min(), pytest.param( lambda df: df.groupby(["x", "y"]).agg("min"), - marks=pytest.mark.skip(reason="https://github.com/dask/dask/issues/9093"), + marks=pytest.mark.skip( + reason="https://github.com/dask/dask/issues/9093" + ), ), lambda df: df.groupby(["x", "y"]).y.min(), lambda df: df.groupby(["x", "y"]).y.agg("min"), @@ -794,24 +831,32 @@ def test_groupby_all_columns(func): def test_groupby_shuffle(): - df = cudf.datasets.randomdata(nrows=640, dtypes={"a": str, "b": int, "c": int}) + df = cudf.datasets.randomdata( + nrows=640, dtypes={"a": str, "b": int, "c": int} + ) gddf = dask_cudf.from_cudf(df, 8) spec = {"b": "mean", "c": "max"} expect = df.groupby("a", sort=True).agg(spec) # Sorted aggregation, single-partition output # (sort=True, split_out=1) - got = gddf.groupby("a", sort=True).agg(spec, shuffle_method=True, split_out=1) + got = gddf.groupby("a", sort=True).agg( + spec, shuffle_method=True, split_out=1 + ) dd.assert_eq(expect, got) # Sorted aggregation, multi-partition output # (sort=True, split_out=2) - got = gddf.groupby("a", sort=True).agg(spec, shuffle_method=True, split_out=2) + got = gddf.groupby("a", sort=True).agg( + spec, shuffle_method=True, split_out=2 + ) dd.assert_eq(expect, got) # Un-sorted aggregation, single-partition output # (sort=False, split_out=1) - got = gddf.groupby("a", sort=False).agg(spec, shuffle_method=True, split_out=1) + got = gddf.groupby("a", sort=False).agg( + spec, shuffle_method=True, split_out=1 + ) dd.assert_eq(expect.sort_index(), got.compute().sort_index()) # Un-sorted aggregation, multi-partition output @@ -824,7 +869,9 @@ def test_groupby_shuffle(): # Sorted aggregation fails with split_out>1 when shuffle is False # 
(sort=True, split_out=2, shuffle_method=False) with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg(spec, shuffle_method=False, split_out=2) + gddf.groupby("a", sort=True).agg( + spec, shuffle_method=False, split_out=2 + ) # Check shuffle kwarg deprecation with pytest.warns(match="'shuffle' keyword is deprecated"): diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 4d3639bfb84..42ecc130298 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -140,7 +140,9 @@ def gather(df, grows): @pytest.mark.parametrize("right_nrows", param_nrows) @pytest.mark.parametrize("left_nkeys", [4, 5]) @pytest.mark.parametrize("right_nkeys", [4, 5]) -def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): +def test_merge_left( + left_nrows, right_nrows, left_nkeys, right_nkeys, how="left" +): chunksize = 3 np.random.seed(0) @@ -165,7 +167,9 @@ def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left" def normalize(df): return ( - df.to_pandas().sort_values(["x", "y", "a_x", "a_y"]).reset_index(drop=True) + df.to_pandas() + .sort_values(["x", "y", "a_x", "a_y"]) + .reset_index(drop=True) ) # dask_cudf @@ -183,7 +187,9 @@ def normalize(df): @pytest.mark.parametrize("right_nrows", [5, 10]) @pytest.mark.parametrize("left_nkeys", [4]) @pytest.mark.parametrize("right_nkeys", [4]) -def test_merge_1col_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how="left"): +def test_merge_1col_left( + left_nrows, right_nrows, left_nkeys, right_nkeys, how="left" +): chunksize = 3 np.random.seed(0) @@ -203,7 +209,11 @@ def test_merge_1col_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how=" ) expect = left.merge(right, on=["x"], how=how) - expect = expect.to_pandas().sort_values(["x", "a_x", "a_y"]).reset_index(drop=True) + expect = ( + expect.to_pandas() + .sort_values(["x", "a_x", "a_y"]) + .reset_index(drop=True) + ) # dask_cudf left = dask_cudf.from_cudf(left, chunksize=chunksize) @@ -262,14 +272,18 @@ def test_indexed_join(how): # occasionally order is not correct (possibly do to hashing in the merge) d = d.sort_values("x") # index is preserved - dg = dg.sort_values("x") # index is reset -- sort_values will slow test down + dg = dg.sort_values( + "x" + ) # index is reset -- sort_values will slow test down dd.assert_eq(d, dg, check_index=False) @pytest.mark.parametrize("how", ["left", "inner"]) def test_how(how): - left = cudf.DataFrame({"x": [1, 2, 3, 4, None], "y": [1.0, 2.0, 3.0, 4.0, 0.0]}) + left = cudf.DataFrame( + {"x": [1, 2, 3, 4, None], "y": [1.0, 2.0, 3.0, 4.0, 0.0]} + ) right = cudf.DataFrame({"x": [2, 3, None, 2], "y": [20, 30, 0, 20]}) dleft = dd.from_pandas(left, npartitions=2) @@ -311,8 +325,12 @@ def test_single_dataframe_merge(daskify): @pytest.mark.parametrize("how", ["inner", "left"]) @pytest.mark.parametrize("on", ["id_1", ["id_1"], ["id_1", "id_2"]]) def test_on(how, on): - left = cudf.DataFrame({"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]}) - right = cudf.DataFrame({"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]}) + left = cudf.DataFrame( + {"id_1": [1, 2, 3, 4, 5], "id_2": [1.0, 2.0, 3.0, 4.0, 0.0]} + ) + right = cudf.DataFrame( + {"id_1": [2, 3, None, 2], "id_2": [2.0, 3.0, 4.0, 20]} + ) dleft = dd.from_pandas(left, npartitions=2) dright = dd.from_pandas(right, npartitions=3) diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py 
b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 333c64eb73b..c3056f2607c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -63,7 +63,9 @@ def test_series_reduce(reducer): ), ], ) -@pytest.mark.parametrize("op", ["max", "min", "sum", "prod", "mean", "var", "std"]) +@pytest.mark.parametrize( + "op", ["max", "min", "sum", "prod", "mean", "var", "std"] +) def test_rowwise_reductions(data, op): gddf = dask_cudf.from_cudf(data, npartitions=10) pddf = gddf.to_backend("pandas") diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index a4dea8901fe..9184ad996ad 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -22,7 +22,9 @@ "c", pytest.param( "d", - marks=xfail_dask_expr("Dask-expr fails to sort by categorical column."), + marks=xfail_dask_expr( + "Dask-expr fails to sort by categorical column." + ), ), ["a", "b"], ["c", "d"], @@ -91,8 +93,12 @@ def test_sort_values_with_nulls(data, by, ascending, na_position): ddf = dd.from_pandas(df, npartitions=5) with dask.config.set(scheduler="single-threaded"): - got = ddf.sort_values(by=by, ascending=ascending, na_position=na_position) - expect = df.sort_values(by=by, ascending=ascending, na_position=na_position) + got = ddf.sort_values( + by=by, ascending=ascending, na_position=na_position + ) + expect = df.sort_values( + by=by, ascending=ascending, na_position=na_position + ) # cudf ordering for nulls is non-deterministic dd.assert_eq(got[by], expect[by], check_index=False) From 90a570bbfc34aec9647393f6f95fe4b6759f4858 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:22:33 -0700 Subject: [PATCH 4/9] Probably errant comma --- python/cudf/cudf/core/udf/strings_lowering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index f2d58e97910..3c02ee52b25 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -249,7 +249,7 @@ def replace_impl(context, builder, sig, args): replacement_ptr = builder.alloca(args[2].type) builder.store(args[0], src_ptr) - (builder.store(args[1], to_replace_ptr),) + builder.store(args[1], to_replace_ptr) builder.store(args[2], replacement_ptr) udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) From 539745b1776c8692bac20006dc635fd6c3b80e14 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 17:14:51 -0700 Subject: [PATCH 5/9] Remove nbqa-black --- .pre-commit-config.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e7bea7dbbb1..67a71021a63 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,9 +57,6 @@ repos: # Use the cudf_kafka isort orderings in notebooks so that dask # and RAPIDS packages have their own sections. args: ["--settings-file=python/cudf_kafka/pyproject.toml"] - - id: nbqa-black - # Explicitly specify the pyproject.toml at the repo root, not per-project.
- args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v16.0.6 hooks: From 9c890fa0c4f71e7b463b1c0fe4d8a9011211c72a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Mar 2024 13:12:53 -0700 Subject: [PATCH 6/9] Update pyproject.toml Co-authored-by: Lawrence Mitchell --- pyproject.toml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c0f6e328b4b..ed4f1aa9eaf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,20 @@ exclude = [ "__init__.py", ] line-length = 79 - +[tool.ruff.format] +exclude = [ + "thirdparty", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "_build", + "buck-out", + "build", + "dist", +] [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] From 4646b9a4c2c61e54456b554abf1ce8696654a772 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Mar 2024 13:16:05 -0700 Subject: [PATCH 7/9] Ignore 501 due to copyright --- pyproject.toml | 2 ++ python/cudf/cudf/core/resample.py | 2 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 2 +- python/cudf/cudf/pandas/module_accelerator.py | 2 +- python/cudf/cudf/pandas/profiler.py | 2 +- python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 2 +- python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py | 2 +- 7 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ed4f1aa9eaf..d4408329408 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ select = ["E", "F", "W"] ignore = [ # whitespace before : "E203", + # line-too-long (due to Copyright header) + "E501", ] fixable = ["ALL"] exclude = [ diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index ec191a974e4..1a79b122561 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 # diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 47403befd6e..b7c8e92e8db 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import copyreg diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index bae31499280..e97d6e4af24 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index 49a417eec09..0124d411e3b 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index e7d327401d9..f017b46866f 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a9a27c22225..631ad2f37b2 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 From 45092b79e48d236d9e5456b36b0fbc5a5cd1ac0f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Mar 2024 13:48:11 -0700 Subject: [PATCH 8/9] Update pyproject.toml Co-authored-by: Bradley Dice --- pyproject.toml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d4408329408..cd4f2aa078a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,20 +51,6 @@ exclude = [ "__init__.py", ] line-length = 79 -[tool.ruff.format] -exclude = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", -] [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] From 05277fdb84412bec2a25653618ddc6ba99e5ad30 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Mar 2024 14:01:03 -0700 Subject: [PATCH 9/9] Blank line between sections --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index cd4f2aa078a..c71394058df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ exclude = [ "__init__.py", ] line-length = 79 + [tool.ruff.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"]
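Net effect of patches 6-9 on the ruff configuration in pyproject.toml: patch 6 adds a [tool.ruff.format] exclude table, patch 7 ignores E501 project-wide (so the SPDX copyright headers no longer need their per-line "# noqa: E501" markers), patch 8 drops the [tool.ruff.format] table again (presumably redundant with the top-level exclude list, which ruff-format also honours), and patch 9 adds the blank line back before [tool.ruff.per-file-ignores]. Reconstructed from the hunks above, the touched region ends up roughly as sketched below; only keys visible in those hunks are shown, and the table header is assumed rather than quoted from the file.

    [tool.ruff]  # table header assumed; not shown in the hunks above
    select = ["E", "F", "W"]
    ignore = [
        # whitespace before :
        "E203",
        # line-too-long (due to Copyright header)
        "E501",
    ]
    fixable = ["ALL"]
    exclude = [
        # earlier entries unchanged and omitted here
        "__init__.py",
    ]
    line-length = 79

    [tool.ruff.per-file-ignores]
    # Lots of pytest implicitly injected attributes in conftest-patch.py
    "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"]

With E501 ignored globally, the long copyright headers pass the linter without suppressions, which is all the header-only hunks in patch 7 change.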