From 6b3462bfbe796950dc3838e16e58d8bb2c0c9690 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 1 Feb 2024 07:04:26 -1000 Subject: [PATCH] Replace legacy cudf and dask_cudf imports as (d)gd (#14944) Discussed offline, replacing this legacy import style without aliasing Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14944 --- python/cudf/cudf/api/extensions/accessor.py | 20 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/datasets.py | 10 +- python/cudf/cudf/tests/test_concat.py | 410 +++++++++--------- .../cudf/cudf/tests/test_custom_accessor.py | 26 +- python/cudf/cudf/tests/test_datasets.py | 14 +- python/dask_cudf/dask_cudf/sorting.py | 10 +- .../dask_cudf/tests/test_accessor.py | 64 +-- python/dask_cudf/dask_cudf/tests/test_core.py | 58 +-- .../dask_cudf/tests/test_delayed_io.py | 58 +-- python/dask_cudf/dask_cudf/tests/test_join.py | 26 +- .../dask_cudf/tests/test_reductions.py | 6 +- 12 files changed, 357 insertions(+), 347 deletions(-) diff --git a/python/cudf/cudf/api/extensions/accessor.py b/python/cudf/cudf/api/extensions/accessor.py index 311b33a5ab8..e4988c1fa68 100644 --- a/python/cudf/cudf/api/extensions/accessor.py +++ b/python/cudf/cudf/api/extensions/accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import warnings @@ -37,8 +37,8 @@ _dataframe_example = """ In your library code: - >>> import cudf as gd - >>> @gd.api.extensions.register_dataframe_accessor("point") + >>> import cudf + >>> @cudf.api.extensions.register_dataframe_accessor("point") ... class PointsAccessor: ... def __init__(self, obj): ... self._validate(obj) @@ -57,7 +57,7 @@ Then in user code: - >>> df = gd.DataFrame({'x': [1,2,3,4,5,6], 'y':[7,6,5,4,3,2]}) + >>> df = cudf.DataFrame({'x': [1,2,3,4,5,6], 'y':[7,6,5,4,3,2]}) >>> df.point.bounding_box (1, 2, 6, 7) @@ -66,8 +66,8 @@ _index_example = """ In your library code: - >>> import cudf as gd - >>> @gd.api.extensions.register_index_accessor("odd") + >>> import cudf + >>> @cudf.api.extensions.register_index_accessor("odd") ... class OddRowAccessor: ... def __init__(self, obj): ... self._obj = obj @@ -76,7 +76,7 @@ Then in user code: - >>> gs = gd.Index(list(range(0, 50))) + >>> gs = cudf.Index(list(range(0, 50))) >>> gs.odd[1] 1 >>> gs.odd[2] @@ -89,8 +89,8 @@ _series_example = """ In your library code: - >>> import cudf as gd - >>> @gd.api.extensions.register_series_accessor("odd") + >>> import cudf + >>> @cudf.api.extensions.register_series_accessor("odd") ... class OddRowAccessor: ... def __init__(self, obj): ... self._obj = obj @@ -99,7 +99,7 @@ Then in user code: - >>> gs = gd.Series(list(range(0, 50))) + >>> gs = cudf.Series(list(range(0, 50))) >>> gs.odd[1] 1 >>> gs.odd[2] diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0a0cefde9cd..659e323c57d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4442,7 +4442,7 @@ def sample( Examples -------- - >>> import cudf as cudf + >>> import cudf >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) >>> df.sample(3) a diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index c6091ab60fc..7b183d5f1a3 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -42,8 +42,8 @@ def timeseries( Examples -------- - >>> import cudf as gd - >>> gdf = gd.datasets.timeseries() + >>> import cudf + >>> gdf = cudf.datasets.timeseries() >>> gdf.head() # doctest: +SKIP timestamp id name x y 2000-01-01 00:00:00 967 Jerry -0.031348 -0.040633 @@ -97,8 +97,8 @@ def randomdata(nrows=10, dtypes=None, seed=None): Examples -------- - >>> import cudf as gd - >>> gdf = gd.datasets.randomdata() + >>> import cudf + >>> gdf = cudf.datasets.randomdata() >>> cdf.head() # doctest: +SKIP id x y 0 1014 0.28361267466770146 -0.44274170661264334 diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 4b0e46bf286..01c37005271 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -8,7 +8,7 @@ import pandas as pd import pytest -import cudf as gd +import cudf from cudf.api.types import _is_categorical_dtype from cudf.core._compat import PANDAS_GE_200 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype @@ -59,8 +59,8 @@ def make_frames(index=None, nulls="none"): mask = mask[:5] df.loc[mask, "y"] = np.nan df2.loc[mask, "y"] = np.nan - gdf = gd.DataFrame.from_pandas(df) - gdf2 = gd.DataFrame.from_pandas(df2) + gdf = cudf.DataFrame.from_pandas(df) + gdf2 = cudf.DataFrame.from_pandas(df2) if index: df = df.set_index(index) df2 = df2.set_index(index) @@ -83,7 +83,7 @@ def test_concat_dataframe(index, nulls, axis): # DataFrame with _hide_concat_empty_dtype_warning(): - res = gd.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() + res = cudf.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() sol = pd.concat([df, df2, df, df_empty1], axis=axis) assert_eq( res, @@ -95,7 +95,7 @@ def test_concat_dataframe(index, nulls, axis): # Series for c in [i for i in ("x", "y", "z") if i != index]: - res = gd.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas() + res = cudf.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas() sol = pd.concat([df[c], df2[c], df[c]], axis=axis) assert_eq( res, @@ -106,7 +106,7 @@ def test_concat_dataframe(index, nulls, axis): ) # Index - res = gd.concat([gdf.index, gdf2.index], axis=axis).to_pandas() + res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() sol = df.index.append(df2.index) assert_eq(res, sol, check_names=False, check_categorical=False) @@ -120,9 +120,9 @@ def test_concat_all_nulls(values): pb = pd.Series([None]) ps = pd.concat([pa, pb]) - ga = gd.Series(values) - gb = gd.Series([None]) - gs = gd.concat([ga, gb]) + ga = cudf.Series(values) + gb = cudf.Series([None]) + gs = cudf.concat([ga, gb]) assert_eq( ps, @@ -139,7 +139,7 @@ def test_concat_errors(): # No objs assert_exceptions_equal( lfunc=pd.concat, - rfunc=gd.concat, + rfunc=cudf.concat, lfunc_args_and_kwargs=([], {"objs": []}), rfunc_args_and_kwargs=([], {"objs": []}), ) @@ -147,7 +147,7 @@ def test_concat_errors(): # All None assert_exceptions_equal( lfunc=pd.concat, - rfunc=gd.concat, + rfunc=cudf.concat, lfunc_args_and_kwargs=([], {"objs": [None, None]}), rfunc_args_and_kwargs=([], {"objs": [None, None]}), ) @@ -155,7 +155,7 @@ def test_concat_errors(): # Mismatched types assert_exceptions_equal( lfunc=pd.concat, - rfunc=gd.concat, + rfunc=cudf.concat, lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), ) @@ -163,7 +163,7 @@ def test_concat_errors(): # Unknown type assert_exceptions_equal( lfunc=pd.concat, - rfunc=gd.concat, + rfunc=cudf.concat, lfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), rfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), ) @@ -174,12 +174,12 @@ def test_concat_errors(): gdf4 = gdf2.set_index("z") with pytest.raises(ValueError, match="All columns must be the same type"): - gd.concat([gdf3, gdf4]) + cudf.concat([gdf3, gdf4]) # Bad axis value assert_exceptions_equal( lfunc=pd.concat, - rfunc=gd.concat, + rfunc=cudf.concat, lfunc_args_and_kwargs=( [], {"objs": [gdf.to_pandas(), gdf2.to_pandas()], "axis": "bad_value"}, @@ -193,7 +193,7 @@ def test_concat_misordered_columns(): gdf2 = gdf2[["z", "x", "y"]] df2 = df2[["z", "x", "y"]] - res = gd.concat([gdf, gdf2]).to_pandas() + res = cudf.concat([gdf, gdf2]).to_pandas() sol = pd.concat([df, df2], sort=False) assert_eq( @@ -211,17 +211,17 @@ def test_concat_columns(axis): pdf2 = pd.DataFrame( np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7] ) - gdf1 = gd.from_pandas(pdf1) - gdf2 = gd.from_pandas(pdf2) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) expect = pd.concat([pdf1, pdf2], axis=axis) - got = gd.concat([gdf1, gdf2], axis=axis) + got = cudf.concat([gdf1, gdf2], axis=axis) assert_eq(expect, got, check_index_type=True) def test_concat_multiindex_dataframe(): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "w": np.arange(4), "x": np.arange(4), @@ -233,22 +233,22 @@ def test_concat_multiindex_dataframe(): pdg = gdg.to_pandas() pdg1 = pdg.iloc[:, :1] pdg2 = pdg.iloc[:, 1:] - gdg1 = gd.from_pandas(pdg1) - gdg2 = gd.from_pandas(pdg2) + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) assert_eq( - gd.concat([gdg1, gdg2]).astype("float64"), + cudf.concat([gdg1, gdg2]).astype("float64"), pd.concat([pdg1, pdg2]), check_index_type=True, ) assert_eq( - gd.concat([gdg1, gdg2], axis=1), + cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1), check_index_type=True, ) def test_concat_multiindex_series(): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "w": np.arange(4), "x": np.arange(4), @@ -260,16 +260,20 @@ def test_concat_multiindex_series(): pdg = gdg.to_pandas() pdg1 = pdg["y"] pdg2 = pdg["z"] - gdg1 = gd.from_pandas(pdg1) - gdg2 = gd.from_pandas(pdg2) + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) assert_eq( - gd.concat([gdg1, gdg2]), pd.concat([pdg1, pdg2]), check_index_type=True + cudf.concat([gdg1, gdg2]), + pd.concat([pdg1, pdg2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1) ) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) def test_concat_multiindex_dataframe_and_series(): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "w": np.arange(4), "x": np.arange(4), @@ -282,17 +286,17 @@ def test_concat_multiindex_dataframe_and_series(): pdg1 = pdg[["y", "z"]] pdg2 = pdg["z"] pdg2.name = "a" - gdg1 = gd.from_pandas(pdg1) - gdg2 = gd.from_pandas(pdg2) + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) assert_eq( - gd.concat([gdg1, gdg2], axis=1), + cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1), check_index_type=True, ) def test_concat_multiindex_series_and_dataframe(): - gdf = gd.DataFrame( + gdf = cudf.DataFrame( { "w": np.arange(4), "x": np.arange(4), @@ -305,10 +309,10 @@ def test_concat_multiindex_series_and_dataframe(): pdg1 = pdg["z"] pdg2 = pdg[["y", "z"]] pdg1.name = "a" - gdg1 = gd.from_pandas(pdg1) - gdg2 = gd.from_pandas(pdg2) + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) assert_eq( - gd.concat([gdg1, gdg2], axis=1), + cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1), check_index_type=True, ) @@ -318,27 +322,27 @@ def test_concat_multiindex_series_and_dataframe(): def test_concat_string_index_name(myindex): # GH-Issue #3420 data = {"a": [123, 456], "b": ["s1", "s2"]} - df1 = gd.DataFrame(data).set_index(myindex) + df1 = cudf.DataFrame(data).set_index(myindex) df2 = df1.copy() - df3 = gd.concat([df1, df2]) + df3 = cudf.concat([df1, df2]) assert df3.index.name == myindex def test_pandas_concat_compatibility_axis1(): - d1 = gd.datasets.randomdata( + d1 = cudf.datasets.randomdata( 3, dtypes={"a": float, "ind": float} ).set_index("ind") - d2 = gd.datasets.randomdata( + d2 = cudf.datasets.randomdata( 3, dtypes={"b": float, "ind": float} ).set_index("ind") - d3 = gd.datasets.randomdata( + d3 = cudf.datasets.randomdata( 3, dtypes={"c": float, "ind": float} ).set_index("ind") - d4 = gd.datasets.randomdata( + d4 = cudf.datasets.randomdata( 3, dtypes={"d": float, "ind": float} ).set_index("ind") - d5 = gd.datasets.randomdata( + d5 = cudf.datasets.randomdata( 3, dtypes={"e": float, "ind": float} ).set_index("ind") @@ -349,7 +353,7 @@ def test_pandas_concat_compatibility_axis1(): pd5 = d5.to_pandas() expect = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1) - got = gd.concat([d1, d2, d3, d4, d5], axis=1) + got = cudf.concat([d1, d2, d3, d4, d5], axis=1) assert_eq( got.sort_index(), @@ -368,28 +372,28 @@ def test_pandas_concat_compatibility_axis1(): ], ) def test_pandas_concat_compatibility_axis1_overlap(index, names, data): - s1 = gd.Series(data[0], index=[0, 1, 2]) - s2 = gd.Series(data[1], index=index) + s1 = cudf.Series(data[0], index=[0, 1, 2]) + s2 = cudf.Series(data[1], index=index) if names: s1.name = names[0] s2.name = names[1] ps1 = s1.to_pandas() ps2 = s2.to_pandas() - got = gd.concat([s1, s2], axis=1) + got = cudf.concat([s1, s2], axis=1) expect = pd.concat([ps1, ps2], axis=1) assert_eq(got, expect, check_index_type=True) def test_pandas_concat_compatibility_axis1_eq_index(): - s1 = gd.Series(["a", "b", "c"], index=[0, 1, 2]) - s2 = gd.Series(["a", "b", "c"], index=[1, 1, 1]) + s1 = cudf.Series(["a", "b", "c"], index=[0, 1, 2]) + s2 = cudf.Series(["a", "b", "c"], index=[1, 1, 1]) ps1 = s1.to_pandas() ps2 = s2.to_pandas() with expect_warning_if(not PANDAS_GE_200): assert_exceptions_equal( lfunc=pd.concat, - rfunc=gd.concat, + rfunc=cudf.concat, lfunc_args_and_kwargs=([], {"objs": [ps1, ps2], "axis": 1}), rfunc_args_and_kwargs=([], {"objs": [s1, s2], "axis": 1}), ) @@ -399,14 +403,14 @@ def test_pandas_concat_compatibility_axis1_eq_index(): def test_pandas_concat_compatibility_axis1_single_column(name): # Pandas renames series name `None` to 0 # and preserves anything else - s = gd.Series([1, 2, 3], name=name) - got = gd.concat([s], axis=1) + s = cudf.Series([1, 2, 3], name=name) + got = cudf.concat([s], axis=1) expected = pd.concat([s.to_pandas()], axis=1) assert_eq(expected, got) def test_concat_duplicate_columns(): - cdf = gd.DataFrame( + cdf = cudf.DataFrame( { "id4": 4 * list(range(6)), "id5": 4 * list(reversed(range(6))), @@ -416,30 +420,34 @@ def test_concat_duplicate_columns(): cdf_std = cdf.groupby(["id4", "id5"])[["v3"]].std() cdf_med = cdf.groupby(["id4", "id5"])[["v3"]].quantile(q=0.5) with pytest.raises(NotImplementedError): - gd.concat([cdf_med, cdf_std], axis=1) + cudf.concat([cdf_med, cdf_std], axis=1) def test_concat_mixed_input(): pdf1 = pd.DataFrame({"a": [10, 20, 30]}) pdf2 = pd.DataFrame({"a": [11, 22, 33]}) - gdf1 = gd.from_pandas(pdf1) - gdf2 = gd.from_pandas(pdf2) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) assert_eq( pd.concat([pdf1, None, pdf2, None]), - gd.concat([gdf1, None, gdf2, None]), + cudf.concat([gdf1, None, gdf2, None]), check_index_type=True, ) assert_eq( - pd.concat([pdf1, None]), gd.concat([gdf1, None]), check_index_type=True + pd.concat([pdf1, None]), + cudf.concat([gdf1, None]), + check_index_type=True, ) assert_eq( - pd.concat([None, pdf2]), gd.concat([None, gdf2]), check_index_type=True + pd.concat([None, pdf2]), + cudf.concat([None, gdf2]), + check_index_type=True, ) assert_eq( pd.concat([None, pdf2, pdf1]), - gd.concat([None, gdf2, gdf1]), + cudf.concat([None, gdf2, gdf1]), check_index_type=True, ) @@ -491,11 +499,11 @@ def test_concat_mixed_input(): ) def test_concat_series_dataframe_input(objs): pd_objs = objs - gd_objs = [gd.from_pandas(obj) for obj in objs] + gd_objs = [cudf.from_pandas(obj) for obj in objs] with _hide_concat_empty_dtype_warning(): expected = pd.concat(pd_objs) - actual = gd.concat(gd_objs) + actual = cudf.concat(gd_objs) assert_eq( expected.fillna(-1), @@ -537,10 +545,10 @@ def test_concat_series_dataframe_input(objs): ) def test_concat_series_dataframe_input_str(objs): pd_objs = objs - gd_objs = [gd.from_pandas(obj) for obj in objs] + gd_objs = [cudf.from_pandas(obj) for obj in objs] expected = pd.concat(pd_objs) - actual = gd.concat(gd_objs) + actual = cudf.concat(gd_objs) assert_eq(expected, actual, check_dtype=False, check_index_type=False) @@ -593,11 +601,11 @@ def test_concat_series_dataframe_input_str(objs): def test_concat_empty_dataframes(df, other, ignore_index): other_pd = [df] + other - gdf = gd.from_pandas(df) - other_gd = [gdf] + [gd.from_pandas(o) for o in other] + gdf = cudf.from_pandas(df) + other_gd = [gdf] + [cudf.from_pandas(o) for o in other] expected = pd.concat(other_pd, ignore_index=ignore_index) - actual = gd.concat(other_gd, ignore_index=ignore_index) + actual = cudf.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: for key, col in actual[actual.columns].items(): if _is_categorical_dtype(col.dtype): @@ -636,11 +644,11 @@ def test_concat_empty_dataframes(df, other, ignore_index): ], ) def test_concat_empty_and_nonempty_series(ignore_index, data, axis): - s1 = gd.Series() - s2 = gd.Series(data[0]) + s1 = cudf.Series() + s2 = cudf.Series(data[0]) ps1 = s1.to_pandas() ps2 = s2.to_pandas() - got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index) + got = cudf.concat([s1, s2], axis=axis, ignore_index=ignore_index) expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) assert_eq(got, expect, check_index_type=True) @@ -649,11 +657,11 @@ def test_concat_empty_and_nonempty_series(ignore_index, data, axis): @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("axis", [0, "index"]) def test_concat_two_empty_series(ignore_index, axis): - s1 = gd.Series() - s2 = gd.Series() + s1 = cudf.Series() + s2 = cudf.Series() ps1 = s1.to_pandas() ps2 = s2.to_pandas() - got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index) + got = cudf.concat([s1, s2], axis=axis, ignore_index=ignore_index) expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) assert_eq(got, expect, check_index_type=True) @@ -663,12 +671,12 @@ def test_concat_two_empty_series(ignore_index, axis): "df1,df2", [ ( - gd.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}), - gd.DataFrame({"k1": [1, 0], "k2": [3, 2], "v2": [6, 7]}), + cudf.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}), + cudf.DataFrame({"k1": [1, 0], "k2": [3, 2], "v2": [6, 7]}), ), ( - gd.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}), - gd.DataFrame({"k1": [0, 1], "k2": [3, 2], "v2": [6, 7]}), + cudf.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}), + cudf.DataFrame({"k1": [0, 1], "k2": [3, 2], "v2": [6, 7]}), ), ], ) @@ -682,7 +690,7 @@ def test_concat_dataframe_with_multiindex(df1, df2): pdf1 = gdf1.to_pandas() pdf2 = gdf2.to_pandas() - actual = gd.concat([gdf1, gdf2], axis=1) + actual = cudf.concat([gdf1, gdf2], axis=1) expected = pd.concat([pdf1, pdf2], axis=1) # Will need to sort_index before comparing as @@ -743,13 +751,13 @@ def test_concat_dataframe_with_multiindex(df1, df2): @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0]) def test_concat_join(objs, ignore_index, sort, join, axis): - gpu_objs = [gd.from_pandas(o) for o in objs] + gpu_objs = [cudf.from_pandas(o) for o in objs] assert_eq( pd.concat( objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis ), - gd.concat( + cudf.concat( gpu_objs, sort=sort, join=join, @@ -778,7 +786,7 @@ def test_concat_join(objs, ignore_index, sort, join, axis): ], ) def test_concat_join_axis_1_dup_error(objs): - gpu_objs = [gd.from_pandas(o) for o in objs] + gpu_objs = [cudf.from_pandas(o) for o in objs] # we do not support duplicate columns with pytest.raises(NotImplementedError): assert_eq( @@ -786,7 +794,7 @@ def test_concat_join_axis_1_dup_error(objs): objs, axis=1, ), - gd.concat( + cudf.concat( gpu_objs, axis=1, ), @@ -816,11 +824,11 @@ def test_concat_join_axis_1_dup_error(objs): @pytest.mark.parametrize("axis", [1]) def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): # no duplicate columns - gpu_objs = [gd.from_pandas(o) for o in objs] + gpu_objs = [cudf.from_pandas(o) for o in objs] expected = pd.concat( objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - actual = gd.concat( + actual = cudf.concat( gpu_objs, sort=sort, join=join, @@ -850,10 +858,10 @@ def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): pdf3 = pd.DataFrame({"j": [1, 2], "k": [1, 2], "s": [1, 2], "t": [1, 2]}) pdf_empty1 = pd.DataFrame() - gdf1 = gd.from_pandas(pdf1) - gdf2 = gd.from_pandas(pdf2) - gdf3 = gd.from_pandas(pdf3) - gdf_empty1 = gd.from_pandas(pdf_empty1) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) + gdf3 = cudf.from_pandas(pdf3) + gdf_empty1 = cudf.from_pandas(pdf_empty1) with _hide_concat_empty_dtype_warning(): assert_eq( @@ -864,7 +872,7 @@ def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): ignore_index=ignore_index, axis=axis, ), - gd.concat( + cudf.concat( [gdf1, gdf2, gdf3, gdf_empty1], sort=sort, join=join, @@ -888,11 +896,11 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): } ) - gdf1 = gd.from_pandas(pdf1) + gdf1 = cudf.from_pandas(pdf1) expected = pd.concat( [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - actual = gd.concat( + actual = cudf.concat( [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) @@ -923,8 +931,8 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): def test_concat_join_no_overlapping_columns( pdf1, pdf2, ignore_index, sort, join, axis ): - gdf1 = gd.from_pandas(pdf1) - gdf2 = gd.from_pandas(pdf2) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) expected = pd.concat( [pdf1, pdf2], @@ -933,7 +941,7 @@ def test_concat_join_no_overlapping_columns( ignore_index=ignore_index, axis=axis, ) - actual = gd.concat( + actual = cudf.concat( [gdf1, gdf2], sort=sort, join=join, @@ -962,10 +970,10 @@ def test_concat_join_no_overlapping_columns_many_and_empty( ) pdf_empty = pd.DataFrame() - gdf4 = gd.from_pandas(pdf4) - gdf5 = gd.from_pandas(pdf5) - gdf6 = gd.from_pandas(pdf6) - gdf_empty = gd.from_pandas(pdf_empty) + gdf4 = cudf.from_pandas(pdf4) + gdf5 = cudf.from_pandas(pdf5) + gdf6 = cudf.from_pandas(pdf6) + gdf_empty = cudf.from_pandas(pdf_empty) with _hide_concat_empty_dtype_warning(): expected = pd.concat( @@ -975,7 +983,7 @@ def test_concat_join_no_overlapping_columns_many_and_empty( ignore_index=ignore_index, axis=axis, ) - actual = gd.concat( + actual = cudf.concat( [gdf4, gdf5, gdf6, gdf_empty], sort=sort, join=join, @@ -1038,7 +1046,7 @@ def test_concat_join_no_overlapping_columns_many_and_empty( def test_concat_join_no_overlapping_columns_many_and_empty2( objs, ignore_index, sort, join, axis ): - objs_gd = [gd.from_pandas(o) if o is not None else o for o in objs] + objs_gd = [cudf.from_pandas(o) if o is not None else o for o in objs] with _hide_concat_empty_dtype_warning(): expected = pd.concat( @@ -1048,7 +1056,7 @@ def test_concat_join_no_overlapping_columns_many_and_empty2( ignore_index=ignore_index, axis=axis, ) - actual = gd.concat( + actual = cudf.concat( objs_gd, sort=sort, join=join, @@ -1074,8 +1082,8 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( ) pdf_empty = pd.DataFrame() - gdf6 = gd.from_pandas(pdf6) - gdf_empty = gd.from_pandas(pdf_empty) + gdf6 = cudf.from_pandas(pdf6) + gdf_empty = cudf.from_pandas(pdf_empty) with _hide_concat_empty_dtype_warning(): expected = pd.concat( @@ -1085,7 +1093,7 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( ignore_index=ignore_index, axis=axis, ) - actual = gd.concat( + actual = cudf.concat( [gdf6, gdf_empty], sort=sort, join=join, @@ -1105,10 +1113,10 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( @pytest.mark.parametrize("join", ["inner", "outer"]) @pytest.mark.parametrize("axis", [0, 1]) def test_concat_join_series(ignore_index, sort, join, axis): - s1 = gd.Series(["a", "b", "c"]) - s2 = gd.Series(["a", "b"]) - s3 = gd.Series(["a", "b", "c", "d"]) - s4 = gd.Series(dtype="str") + s1 = cudf.Series(["a", "b", "c"]) + s2 = cudf.Series(["a", "b"]) + s3 = cudf.Series(["a", "b", "c", "d"]) + s4 = cudf.Series(dtype="str") ps1 = s1.to_pandas() ps2 = s2.to_pandas() @@ -1123,7 +1131,7 @@ def test_concat_join_series(ignore_index, sort, join, axis): axis=axis, ) with expect_warning_if(axis == 1): - actual = gd.concat( + actual = cudf.concat( [s1, s2, s3, s4], sort=sort, join=join, @@ -1191,13 +1199,13 @@ def test_concat_join_empty_dataframes( df, other, ignore_index, axis, join, sort ): other_pd = [df] + other - gdf = gd.from_pandas(df) - other_gd = [gdf] + [gd.from_pandas(o) for o in other] + gdf = cudf.from_pandas(df) + other_gd = [gdf] + [cudf.from_pandas(o) for o in other] expected = pd.concat( other_pd, ignore_index=ignore_index, axis=axis, join=join, sort=sort ) - actual = gd.concat( + actual = cudf.concat( other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort ) if expected.shape != df.shape: @@ -1302,8 +1310,8 @@ def test_concat_join_empty_dataframes_axis_1( ): # no duplicate columns other_pd = [df] + other - gdf = gd.from_pandas(df) - other_gd = [gdf] + [gd.from_pandas(o) for o in other] + gdf = cudf.from_pandas(df) + other_gd = [gdf] + [cudf.from_pandas(o) for o in other] with _hide_concat_empty_dtype_warning(): expected = pd.concat( @@ -1313,7 +1321,7 @@ def test_concat_join_empty_dataframes_axis_1( join=join, sort=sort, ) - actual = gd.concat( + actual = cudf.concat( other_gd, ignore_index=ignore_index, axis=axis, @@ -1356,18 +1364,18 @@ def test_concat_preserve_order(): assert_eq( pd.concat(dfs, join="inner"), - gd.concat([gd.DataFrame(df) for df in dfs], join="inner"), + cudf.concat([cudf.DataFrame(df) for df in dfs], join="inner"), check_index_type=True, ) @pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("typ", [gd.DataFrame, gd.Series]) +@pytest.mark.parametrize("typ", [cudf.DataFrame, cudf.Series]) def test_concat_single_object(ignore_index, typ): """Ensure that concat on a single object does not change it.""" obj = typ([1, 2, 3]) assert_eq( - gd.concat([obj], ignore_index=ignore_index, axis=0), + cudf.concat([obj], ignore_index=ignore_index, axis=0), obj, check_index_type=True, ) @@ -1382,15 +1390,15 @@ def test_concat_single_object(ignore_index, typ): [ Decimal64Dtype(3, 2), Decimal64Dtype(8, 4), - gd.Decimal128Dtype(3, 2), - gd.Decimal32Dtype(8, 4), + cudf.Decimal128Dtype(3, 2), + cudf.Decimal32Dtype(8, 4), ], ) def test_concat_decimal_dataframe(ltype, rtype): - gdf1 = gd.DataFrame( + gdf1 = cudf.DataFrame( {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]} ) - gdf2 = gd.DataFrame( + gdf2 = cudf.DataFrame( {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]} ) @@ -1400,7 +1408,7 @@ def test_concat_decimal_dataframe(ltype, rtype): pdf1 = gdf1.to_pandas() pdf2 = gdf2.to_pandas() - got = gd.concat([gdf1, gdf2]) + got = cudf.concat([gdf1, gdf2]) expected = pd.concat([pdf1, pdf2]) assert_eq(expected, got, check_index_type=True) @@ -1417,13 +1425,13 @@ def test_concat_decimal_dataframe(ltype, rtype): ], ) def test_concat_decimal_series(ltype, rtype): - gs1 = gd.Series(["228.3", "559.5", "281.1"]).astype(ltype) - gs2 = gd.Series(["2.345", "5.259", "8.154"]).astype(rtype) + gs1 = cudf.Series(["228.3", "559.5", "281.1"]).astype(ltype) + gs2 = cudf.Series(["2.345", "5.259", "8.154"]).astype(rtype) ps1 = gs1.to_pandas() ps2 = gs2.to_pandas() - got = gd.concat([gs1, gs2]) + got = cudf.concat([gs1, gs2]) expected = pd.concat([ps1, ps2]) assert_eq(expected, got, check_index_type=True) @@ -1433,16 +1441,16 @@ def test_concat_decimal_series(ltype, rtype): "df1, df2, df3, expected", [ ( - gd.DataFrame( + cudf.DataFrame( {"val": [Decimal("42.5"), Decimal("8.7")]}, dtype=Decimal64Dtype(5, 2), ), - gd.DataFrame( + cudf.DataFrame( {"val": [Decimal("9.23"), Decimal("-67.49")]}, dtype=Decimal64Dtype(6, 4), ), - gd.DataFrame({"val": [8, -5]}, dtype="int32"), - gd.DataFrame( + cudf.DataFrame({"val": [8, -5]}, dtype="int32"), + cudf.DataFrame( { "val": [ Decimal("42.5"), @@ -1458,13 +1466,13 @@ def test_concat_decimal_series(ltype, rtype): ), ), ( - gd.DataFrame( + cudf.DataFrame( {"val": [Decimal("95.2"), Decimal("23.4")]}, dtype=Decimal64Dtype(5, 2), ), - gd.DataFrame({"val": [54, 509]}, dtype="uint16"), - gd.DataFrame({"val": [24, -48]}, dtype="int32"), - gd.DataFrame( + cudf.DataFrame({"val": [54, 509]}, dtype="uint16"), + cudf.DataFrame({"val": [24, -48]}, dtype="int32"), + cudf.DataFrame( { "val": [ Decimal("95.2"), @@ -1480,13 +1488,13 @@ def test_concat_decimal_series(ltype, rtype): ), ), ( - gd.DataFrame( + cudf.DataFrame( {"val": [Decimal("36.56"), Decimal("-59.24")]}, dtype=Decimal64Dtype(9, 4), ), - gd.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), - gd.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), - gd.DataFrame( + cudf.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), + cudf.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), + cudf.DataFrame( { "val": [ Decimal("36.56"), @@ -1502,13 +1510,13 @@ def test_concat_decimal_series(ltype, rtype): ), ), ( - gd.DataFrame( + cudf.DataFrame( {"val": [Decimal("9563.24"), Decimal("236.633")]}, dtype=Decimal64Dtype(9, 4), ), - gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), - gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), - gd.DataFrame( + cudf.DataFrame({"val": [5393, -95832]}, dtype="int64"), + cudf.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + cudf.DataFrame( { "val": [ Decimal("9563.24"), @@ -1524,13 +1532,13 @@ def test_concat_decimal_series(ltype, rtype): ), ), ( - gd.DataFrame( + cudf.DataFrame( {"val": [Decimal("95633.24"), Decimal("236.633")]}, dtype=Decimal128Dtype(19, 4), ), - gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), - gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), - gd.DataFrame( + cudf.DataFrame({"val": [5393, -95832]}, dtype="int64"), + cudf.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + cudf.DataFrame( { "val": [ Decimal("95633.24"), @@ -1548,7 +1556,7 @@ def test_concat_decimal_series(ltype, rtype): ], ) def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): - df = gd.concat([df1, df2, df3]) + df = cudf.concat([df1, df2, df3]) assert_eq(df, expected, check_index_type=True) assert_eq(df.val.dtype, expected.val.dtype) @@ -1557,15 +1565,15 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): "s1, s2, s3, expected", [ ( - gd.Series( + cudf.Series( [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) ), - gd.Series( + cudf.Series( [Decimal("101.243"), Decimal("-92.449")], dtype=Decimal64Dtype(9, 6), ), - gd.Series([94, -22], dtype="int32"), - gd.Series( + cudf.Series([94, -22], dtype="int32"), + cudf.Series( [ Decimal("32.8"), Decimal("-87.7"), @@ -1579,12 +1587,12 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ), ), ( - gd.Series( + cudf.Series( [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) ), - gd.Series([33, 984], dtype="uint32"), - gd.Series([593, -702], dtype="int32"), - gd.Series( + cudf.Series([33, 984], dtype="uint32"), + cudf.Series([593, -702], dtype="int32"), + cudf.Series( [ Decimal("7.2"), Decimal("122.1"), @@ -1598,13 +1606,13 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ), ), ( - gd.Series( + cudf.Series( [Decimal("982.94"), Decimal("-493.626")], dtype=Decimal64Dtype(9, 4), ), - gd.Series([847.98, 254.442], dtype="float32"), - gd.Series([5299.262, -2049.25], dtype="float64"), - gd.Series( + cudf.Series([847.98, 254.442], dtype="float32"), + cudf.Series([5299.262, -2049.25], dtype="float64"), + cudf.Series( [ Decimal("982.94"), Decimal("-493.626"), @@ -1618,13 +1626,13 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ), ), ( - gd.Series( + cudf.Series( [Decimal("492.204"), Decimal("-72824.455")], dtype=Decimal64Dtype(9, 4), ), - gd.Series([8438, -27462], dtype="int64"), - gd.Series([-40.292, 49202.953], dtype="float64"), - gd.Series( + cudf.Series([8438, -27462], dtype="int64"), + cudf.Series([-40.292, 49202.953], dtype="float64"), + cudf.Series( [ Decimal("492.204"), Decimal("-72824.455"), @@ -1638,19 +1646,19 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ), ), ( - gd.Series( + cudf.Series( [Decimal("492.204"), Decimal("-72824.455")], dtype=Decimal64Dtype(10, 4), ), - gd.Series( + cudf.Series( [Decimal("8438"), Decimal("-27462")], dtype=Decimal32Dtype(9, 4), ), - gd.Series( + cudf.Series( [Decimal("-40.292"), Decimal("49202.953")], dtype=Decimal128Dtype(19, 4), ), - gd.Series( + cudf.Series( [ Decimal("492.204"), Decimal("-72824.455"), @@ -1666,7 +1674,7 @@ def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): ], ) def test_concat_decimal_numeric_series(s1, s2, s3, expected): - s = gd.concat([s1, s2, s3]) + s = cudf.concat([s1, s2, s3]) assert_eq(s, expected, check_index_type=True) @@ -1674,11 +1682,11 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): "s1, s2, expected", [ ( - gd.Series( + cudf.Series( [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) ), - gd.Series(["2007-06-12", "2006-03-14"], dtype="datetime64[s]"), - gd.Series( + cudf.Series(["2007-06-12", "2006-03-14"], dtype="datetime64[s]"), + cudf.Series( [ "955.22", "8.20", @@ -1689,17 +1697,17 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): ), ), ( - gd.Series( + cudf.Series( [Decimal("-52.44"), Decimal("365.22")], dtype=Decimal64Dtype(5, 2), ), - gd.Series( + cudf.Series( np.arange( "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" ), dtype="datetime64[s]", ), - gd.Series( + cudf.Series( [ "-52.44", "365.22", @@ -1711,25 +1719,25 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): ), ), ( - gd.Series( + cudf.Series( [Decimal("753.0"), Decimal("94.22")], dtype=Decimal64Dtype(5, 2), ), - gd.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), - gd.Series( + cudf.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), + cudf.Series( ["753.00", "94.22", "0 days 00:01:51", "0 days 00:08:29"], index=[0, 1, 0, 1], ), ), ( - gd.Series( + cudf.Series( [Decimal("753.0"), Decimal("94.22")], dtype=Decimal64Dtype(5, 2), ), - gd.Series( + cudf.Series( [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] ), - gd.Series( + cudf.Series( ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], index=[0, 1, 0, 1], ), @@ -1737,7 +1745,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected): ], ) def test_concat_decimal_non_numeric(s1, s2, expected): - s = gd.concat([s1, s2]) + s = cudf.concat([s1, s2]) assert_eq(s, expected, check_index_type=True) @@ -1745,9 +1753,9 @@ def test_concat_decimal_non_numeric(s1, s2, expected): "s1, s2, expected", [ ( - gd.Series([{"a": 5}, {"c": "hello"}, {"b": 7}]), - gd.Series([{"a": 5, "c": "hello", "b": 7}]), - gd.Series( + cudf.Series([{"a": 5}, {"c": "hello"}, {"b": 7}]), + cudf.Series([{"a": 5, "c": "hello", "b": 7}]), + cudf.Series( [ {"a": 5, "b": None, "c": None}, {"a": None, "b": None, "c": "hello"}, @@ -1760,7 +1768,7 @@ def test_concat_decimal_non_numeric(s1, s2, expected): ], ) def test_concat_struct_column(s1, s2, expected): - s = gd.concat([s1, s2]) + s = cudf.concat([s1, s2]) assert_eq(s, expected, check_index_type=True) @@ -1768,9 +1776,9 @@ def test_concat_struct_column(s1, s2, expected): "frame1, frame2, expected", [ ( - gd.Series([[{"b": 0}], [{"b": 1}], [{"b": 3}]]), - gd.Series([[{"b": 10}], [{"b": 12}], None]), - gd.Series( + cudf.Series([[{"b": 0}], [{"b": 1}], [{"b": 3}]]), + cudf.Series([[{"b": 10}], [{"b": 12}], None]), + cudf.Series( [ [{"b": 0}], [{"b": 1}], @@ -1783,9 +1791,9 @@ def test_concat_struct_column(s1, s2, expected): ), ), ( - gd.DataFrame({"a": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}), - gd.DataFrame({"a": [[{"b": 10}], [{"b": 12}], None]}), - gd.DataFrame( + cudf.DataFrame({"a": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}), + cudf.DataFrame({"a": [[{"b": 10}], [{"b": 12}], None]}), + cudf.DataFrame( { "a": [ [{"b": 0}], @@ -1802,7 +1810,7 @@ def test_concat_struct_column(s1, s2, expected): ], ) def test_concat_list_column(frame1, frame2, expected): - actual = gd.concat([frame1, frame2]) + actual = cudf.concat([frame1, frame2]) assert_eq(actual, expected, check_index_type=True) @@ -1814,10 +1822,10 @@ def test_concat_categorical_ordering(): sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) df = pd.DataFrame({"a": sr}) - gdf = gd.from_pandas(df) + gdf = cudf.from_pandas(df) expect = pd.concat([df, df, df]) - got = gd.concat([gdf, gdf, gdf]) + got = cudf.concat([gdf, gdf, gdf]) assert_eq(expect, got) @@ -1852,8 +1860,8 @@ def singleton_concat_obj(request, singleton_concat_index): def test_concat_singleton_sorting( axis, sort, ignore_index, singleton_concat_obj ): - gobj = gd.from_pandas(singleton_concat_obj) - gconcat = gd.concat( + gobj = cudf.from_pandas(singleton_concat_obj) + gconcat = cudf.concat( [gobj], axis=axis, sort=sort, ignore_index=ignore_index ) pconcat = pd.concat( @@ -1864,9 +1872,9 @@ def test_concat_singleton_sorting( @pytest.mark.parametrize("axis", [2, "invalid"]) def test_concat_invalid_axis(axis): - s = gd.Series([1, 2, 3]) + s = cudf.Series([1, 2, 3]) with pytest.raises(ValueError): - gd.concat([s], axis=axis) + cudf.concat([s], axis=axis) @pytest.mark.parametrize( @@ -1876,7 +1884,7 @@ def test_concat_invalid_axis(axis): ], ) def test_concat_mixed_list_types_error(s1, s2): - s1, s2 = gd.Series(s1), gd.Series(s2) + s1, s2 = cudf.Series(s1), cudf.Series(s2) with pytest.raises(NotImplementedError): - gd.concat([s1, s2], ignore_index=True) + cudf.concat([s1, s2], ignore_index=True) diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index 325be954fe4..5ffe255d0f8 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -1,13 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import pandas as pd import pytest -import cudf as gd +import cudf from cudf.testing._utils import assert_eq -@gd.api.extensions.register_dataframe_accessor("point") +@cudf.api.extensions.register_dataframe_accessor("point") @pd.api.extensions.register_dataframe_accessor("point") class PointsAccessor: def __init__(self, obj): @@ -29,7 +29,7 @@ def bounding_box(self): @pytest.mark.parametrize( - "gdf", [gd.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] + "gdf", [cudf.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] ) def test_dataframe_accessor(gdf): pdf = gdf.to_pandas() @@ -38,10 +38,10 @@ def test_dataframe_accessor(gdf): @pytest.mark.parametrize( - "gdf1", [gd.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] + "gdf1", [cudf.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] ) @pytest.mark.parametrize( - "gdf2", [gd.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] + "gdf2", [cudf.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] ) def test_dataframe_accessor_idendity(gdf1, gdf2): """Test for accessor identities @@ -55,8 +55,8 @@ def test_dataframe_accessor_idendity(gdf1, gdf2): @pd.api.extensions.register_index_accessor("odd") @pd.api.extensions.register_series_accessor("odd") -@gd.api.extensions.register_index_accessor("odd") -@gd.api.extensions.register_series_accessor("odd") +@cudf.api.extensions.register_index_accessor("odd") +@cudf.api.extensions.register_series_accessor("odd") class OddRowAccessor: def __init__(self, obj): self._obj = obj @@ -65,7 +65,7 @@ def __getitem__(self, i): return self._obj[2 * i - 1] -@pytest.mark.parametrize("gidx", [gd.Index(list(range(0, 50)))]) +@pytest.mark.parametrize("gidx", [cudf.Index(list(range(0, 50)))]) def test_index_accessor(gidx): pidx = gidx.to_pandas() @@ -73,7 +73,7 @@ def test_index_accessor(gidx): assert_eq(gidx.odd[i], pidx.odd[i]) -@pytest.mark.parametrize("gs", [gd.Series(list(range(1, 50)))]) +@pytest.mark.parametrize("gs", [cudf.Series(list(range(1, 50)))]) def test_series_accessor(gs): ps = gs.to_pandas() @@ -82,10 +82,10 @@ def test_series_accessor(gs): @pytest.mark.parametrize( - "gdf", [gd.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] + "gdf", [cudf.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] ) -@pytest.mark.parametrize("gidx", [gd.Index(list(range(1, 50)))]) -@pytest.mark.parametrize("gs", [gd.Series(list(range(1, 50)))]) +@pytest.mark.parametrize("gidx", [cudf.Index(list(range(1, 50)))]) +@pytest.mark.parametrize("gs", [cudf.Series(list(range(1, 50)))]) def test_accessor_space_separate(gdf, gidx, gs): assert not id(gdf._accessors) == id(gidx._accessors) assert not id(gidx._accessors) == id(gs._accessors) diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index 320c221fcb2..45629868ccc 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -2,15 +2,15 @@ import numpy as np -import cudf as gd +import cudf from cudf.testing._utils import assert_eq def test_dataset_timeseries(): - gdf1 = gd.datasets.timeseries( + gdf1 = cudf.datasets.timeseries( dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 ) - gdf2 = gd.datasets.timeseries( + gdf2 = cudf.datasets.timeseries( dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 ) @@ -20,7 +20,7 @@ def test_dataset_timeseries(): assert gdf1["y"].head().dtype == float assert gdf1.index.name == "timestamp" - gdf = gd.datasets.timeseries( + gdf = cudf.datasets.timeseries( "2000", "2010", freq="2H", @@ -33,13 +33,13 @@ def test_dataset_timeseries(): assert gdf["id"].head().dtype == int assert gdf["name"].head().dtype == "category" - gdf = gd.datasets.randomdata() + gdf = cudf.datasets.randomdata() assert gdf["id"].head().dtype == int assert gdf["x"].head().dtype == float assert gdf["y"].head().dtype == float assert len(gdf) == 10 - gdf = gd.datasets.randomdata( + gdf = cudf.datasets.randomdata( nrows=20, dtypes={"id": int, "a": int, "b": float} ) assert gdf["id"].head().dtype == int @@ -51,7 +51,7 @@ def test_dataset_timeseries(): def test_make_bool(): n = 10 state = np.random.RandomState(12) - arr = gd.datasets.make_bool(n, state) + arr = cudf.datasets.make_bool(n, state) assert np.all(np.isin(arr, [True, False])) assert arr.size == n assert arr.dtype == bool diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index d01ada92e33..f3774e20d32 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -16,7 +16,7 @@ from dask.highlevelgraph import HighLevelGraph from dask.utils import M -import cudf as gd +import cudf from cudf.api.types import _is_categorical_dtype from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate @@ -118,7 +118,7 @@ def _append_counts(val, count): return val # Sort by calculated quantile values, then number of observations. - combined_vals_counts = gd.core.reshape._merge_sorted( + combined_vals_counts = cudf.core.reshape._merge_sorted( [*map(_append_counts, vals, counts)] ) combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) @@ -180,7 +180,7 @@ def finalize_tsk(tsk): if len(qs) == 0: name = "quantiles-" + token - empty_index = gd.Index([], dtype=float) + empty_index = cudf.Index([], dtype=float) return Series( { (name, 0): final_type( @@ -305,7 +305,7 @@ def sort_values( # Step 2 - Perform repartitioning shuffle meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (gd.Series, gd.DataFrame)): + if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): dtype = df[by[0]].dtype divisions = df._meta._constructor_sliced(divisions, dtype=dtype) @@ -330,7 +330,7 @@ def sort_values( # Step 3 - Return final sorted df df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, gd.DataFrame) and set_divisions: + if not isinstance(divisions, cudf.DataFrame) and set_divisions: # Can't have multi-column divisions elsewhere in dask (yet) df4.divisions = tuple(methods.tolist(divisions)) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index a6a457d98a4..8c9ce45df59 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -11,7 +11,7 @@ from cudf import DataFrame, Series, date_range from cudf.testing._utils import assert_eq, does_not_raise -import dask_cudf as dgd +import dask_cudf ############################################################################# # Datetime Accessor # @@ -33,7 +33,7 @@ def data_dt_2(): def test_datetime_accessor_initialization(data): pdsr = pd.Series(data.copy()) sr = Series(pdsr) - dsr = dgd.from_cudf(sr, npartitions=5) + dsr = dask_cudf.from_cudf(sr, npartitions=5) with pytest.raises(AttributeError): dsr.dt @@ -42,7 +42,7 @@ def test_datetime_accessor_initialization(data): def test_series(data): pdsr = pd.Series(data.copy()) sr = Series(pdsr) - dsr = dgd.from_cudf(sr, npartitions=5) + dsr = dask_cudf.from_cudf(sr, npartitions=5) np.testing.assert_equal(np.array(pdsr), dsr.compute().values_host) @@ -52,7 +52,7 @@ def test_series(data): def test_dt_series(data, field): pdsr = pd.Series(data.copy()) sr = Series(pdsr) - dsr = dgd.from_cudf(sr, npartitions=5) + dsr = dask_cudf.from_cudf(sr, npartitions=5) base = getattr(pdsr.dt, field) test = getattr(dsr.dt, field).compute() assert_eq(base, test, check_dtype=False) @@ -61,7 +61,7 @@ def test_dt_series(data, field): @pytest.mark.parametrize("data", [data_dt_1()]) def test_dt_accessor(data): df = DataFrame({"dt_col": data.copy()}) - ddf = dgd.from_cudf(df, npartitions=5) + ddf = dask_cudf.from_cudf(df, npartitions=5) for i in ["year", "month", "day", "hour", "minute", "second", "weekday"]: assert i in dir(ddf.dt_col.dt) @@ -98,14 +98,14 @@ def data_cat_3(): @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_accessor_initialization1(data): sr = Series(data.copy()) - dsr = dgd.from_cudf(sr, npartitions=5) + dsr = dask_cudf.from_cudf(sr, npartitions=5) dsr.cat @pytest.mark.parametrize("data", [data_cat_2()]) def test_categorical_accessor_initialization2(data): sr = Series(data.copy()) - dsr = dgd.from_cudf(sr, npartitions=5) + dsr = dask_cudf.from_cudf(sr, npartitions=5) with pytest.raises(AttributeError): dsr.cat @@ -115,7 +115,7 @@ def test_categorical_basic(data): cat = data.copy() pdsr = pd.Series(cat) sr = Series(cat) - dsr = dgd.from_cudf(sr, npartitions=2) + dsr = dask_cudf.from_cudf(sr, npartitions=2) result = dsr.compute() np.testing.assert_array_equal(cat.codes, result.cat.codes.values_host) @@ -143,7 +143,7 @@ def test_categorical_basic(data): df["a"] = ["xyz", "abc", "def"] * 10 pdf = df.to_pandas() - cddf = dgd.from_cudf(df, 1) + cddf = dask_cudf.from_cudf(df, 1) cddf["b"] = cddf["a"].astype("category") ddf = dd.from_pandas(pdf, 1) @@ -169,7 +169,7 @@ def test_categorical_compare_unordered(data): cat = data.copy() pdsr = pd.Series(cat) sr = Series(cat) - dsr = dgd.from_cudf(sr, npartitions=2) + dsr = dask_cudf.from_cudf(sr, npartitions=2) # Test equality out = dsr == dsr @@ -209,8 +209,8 @@ def test_categorical_compare_ordered(data): pdsr2 = pd.Series(cat2) sr1 = Series(cat1) sr2 = Series(cat2) - dsr1 = dgd.from_cudf(sr1, npartitions=2) - dsr2 = dgd.from_cudf(sr2, npartitions=2) + dsr1 = dask_cudf.from_cudf(sr1, npartitions=2) + dsr2 = dask_cudf.from_cudf(sr2, npartitions=2) # Test equality out = dsr1 == dsr1 @@ -248,7 +248,7 @@ def data_str_1(): def test_string_slicing(data): pdsr = pd.Series(data.copy()) sr = Series(pdsr) - dsr = dgd.from_cudf(sr, npartitions=2) + dsr = dask_cudf.from_cudf(sr, npartitions=2) base = pdsr.str.slice(0, 4) test = dsr.str.slice(0, 4).compute() assert_eq(base, test) @@ -261,7 +261,7 @@ def test_categorical_categories(): df["a"] = df["a"].astype("category") pdf = df.to_pandas(nullable=False) - ddf = dgd.from_cudf(df, 2) + ddf = dask_cudf.from_cudf(df, 2) dpdf = dd.from_pandas(pdf, 2) dd.assert_eq( @@ -272,7 +272,7 @@ def test_categorical_categories(): def test_categorical_as_known(): - df = dgd.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) + df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2) df["col_1"] = df["col_1"].astype("category") actual = df["col_1"].cat.as_known() @@ -285,7 +285,7 @@ def test_categorical_as_known(): def test_str_slice(): df = DataFrame({"a": ["abc,def,123", "xyz,hi,bye"]}) - ddf = dgd.from_cudf(df, 1) + ddf = dask_cudf.from_cudf(df, 1) pdf = df.to_pandas() dd.assert_eq( @@ -345,7 +345,7 @@ def data_test_sort(): ) def test_create_list_series(data): expect = pd.Series(data) - ds_got = dgd.from_cudf(Series(data), 4) + ds_got = dask_cudf.from_cudf(Series(data), 4) assert_eq(expect, ds_got.compute()) @@ -355,7 +355,7 @@ def test_create_list_series(data): ) def test_unique(data): expect = Series(data).list.unique() - ds = dgd.from_cudf(Series(data), 5) + ds = dask_cudf.from_cudf(Series(data), 5) assert_eq(expect, ds.list.unique().compute()) @@ -365,7 +365,7 @@ def test_unique(data): ) def test_len(data): expect = Series(data).list.len() - ds = dgd.from_cudf(Series(data), 5) + ds = dask_cudf.from_cudf(Series(data), 5) assert_eq(expect, ds.list.len().compute()) @@ -375,7 +375,7 @@ def test_len(data): ) def test_contains(data, search_key): expect = Series(data).list.contains(search_key) - ds = dgd.from_cudf(Series(data), 5) + ds = dask_cudf.from_cudf(Series(data), 5) assert_eq(expect, ds.list.contains(search_key).compute()) @@ -388,7 +388,7 @@ def test_contains(data, search_key): ) def test_get(data, index): expect = Series(data).list.get(index) - ds = dgd.from_cudf(Series(data), 5) + ds = dask_cudf.from_cudf(Series(data), 5) assert_eq(expect, ds.list.get(index).compute()) @@ -398,7 +398,7 @@ def test_get(data, index): ) def test_leaves(data): expect = Series(data).list.leaves - ds = dgd.from_cudf(Series(data), 5) + ds = dask_cudf.from_cudf(Series(data), 5) got = ds.list.leaves.compute().reset_index(drop=True) assert_eq(expect, got) @@ -419,7 +419,7 @@ def test_take(data, list_indices, expectation): expect = Series(data).list.take(list_indices) if expectation == does_not_raise(): - ds = dgd.from_cudf(Series(data), 5) + ds = dask_cudf.from_cudf(Series(data), 5) assert_eq(expect, ds.list.take(list_indices).compute()) @@ -435,7 +435,7 @@ def test_sorting(data, ascending, na_position, ignore_index): ascending=ascending, na_position=na_position, ignore_index=ignore_index ) got = ( - dgd.from_cudf(Series(data), 5) + dask_cudf.from_cudf(Series(data), 5) .list.sort_values( ascending=ascending, na_position=na_position, @@ -464,7 +464,7 @@ def test_sorting(data, ascending, na_position, ignore_index): ) def test_create_struct_series(data): expect = pd.Series(data) - ds_got = dgd.from_cudf(Series(data), 2) + ds_got = dask_cudf.from_cudf(Series(data), 2) assert_eq(expect, ds_got.compute()) @@ -475,7 +475,7 @@ def test_create_struct_series(data): def test_struct_field_str(data): for test_key in ["a", "b"]: expect = Series(data).struct.field(test_key) - ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) + ds_got = dask_cudf.from_cudf(Series(data), 2).struct.field(test_key) assert_eq(expect, ds_got.compute()) @@ -486,7 +486,7 @@ def test_struct_field_str(data): def test_struct_field_integer(data): for test_key in [0, 1]: expect = Series(data).struct.field(test_key) - ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) + ds_got = dask_cudf.from_cudf(Series(data), 2).struct.field(test_key) assert_eq(expect, ds_got.compute()) @@ -495,7 +495,7 @@ def test_struct_field_integer(data): struct_accessor_data_params, ) def test_dask_struct_field_Key_Error(data): - got = dgd.from_cudf(Series(data), 2) + got = dask_cudf.from_cudf(Series(data), 2) with pytest.raises(KeyError): got.struct.field("notakey").compute() @@ -507,7 +507,7 @@ def test_dask_struct_field_Key_Error(data): ) def test_dask_struct_field_Int_Error(data): # breakpoint() - got = dgd.from_cudf(Series(data), 2) + got = dask_cudf.from_cudf(Series(data), 2) with pytest.raises(IndexError): got.struct.field(1000).compute() @@ -523,7 +523,7 @@ def test_dask_struct_field_Int_Error(data): ) def test_struct_explode(data): expect = Series(data).struct.explode() - got = dgd.from_cudf(Series(data), 2).struct.explode() + got = dask_cudf.from_cudf(Series(data), 2).struct.explode() # Output index will not agree for >1 partitions assert_eq(expect, got.compute().reset_index(drop=True)) @@ -533,7 +533,7 @@ def test_tz_localize(): expect = data.dt.tz_localize( "US/Eastern", ambiguous="NaT", nonexistent="NaT" ) - got = dgd.from_cudf(data, 2).dt.tz_localize( + got = dask_cudf.from_cudf(data, 2).dt.tz_localize( "US/Eastern", ambiguous="NaT", nonexistent="NaT" ) dd.assert_eq(expect, got) @@ -554,5 +554,5 @@ def test_tz_localize(): ) def test_tz_convert(data): expect = Series(data).dt.tz_convert("US/Pacific") - got = dgd.from_cudf(Series(data), 2).dt.tz_convert("US/Pacific") + got = dask_cudf.from_cudf(Series(data), 2).dt.tz_convert("US/Pacific") dd.assert_eq(expect, got) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index afe2a050695..250256d3356 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -15,7 +15,7 @@ import cudf -import dask_cudf as dgd +import dask_cudf def test_from_dict_backend_dispatch(): @@ -28,7 +28,7 @@ def test_from_dict_backend_dispatch(): expect = cudf.DataFrame(data) with dask.config.set({"dataframe.backend": "cudf"}): ddf = dd.from_dict(data, npartitions=2) - assert isinstance(ddf, dgd.DataFrame) + assert isinstance(ddf, dask_cudf.DataFrame) dd.assert_eq(expect, ddf) @@ -43,7 +43,7 @@ def test_to_backend(): assert isinstance(ddf._meta, pd.DataFrame) gdf = ddf.to_backend("cudf") - assert isinstance(gdf, dgd.DataFrame) + assert isinstance(gdf, dask_cudf.DataFrame) dd.assert_eq(cudf.DataFrame(data), ddf) assert isinstance(gdf.to_backend()._meta, pd.DataFrame) @@ -58,13 +58,13 @@ def test_to_backend_kwargs(): # Using `nan_as_null=False` will result in a cudf-backed # Series with a NaN element (ranther than ) gser_nan = dser.to_backend("cudf", nan_as_null=False) - assert isinstance(gser_nan, dgd.Series) + assert isinstance(gser_nan, dask_cudf.Series) assert np.isnan(gser_nan.compute()).sum() == 1 # Using `nan_as_null=True` will result in a cudf-backed # Series with a element (ranther than NaN) gser_null = dser.to_backend("cudf", nan_as_null=True) - assert isinstance(gser_null, dgd.Series) + assert isinstance(gser_null, dask_cudf.Series) assert np.isnan(gser_null.compute()).sum() == 0 # Check `nullable` argument for `cudf.Series.to_pandas` @@ -110,7 +110,7 @@ def test_from_cudf_multiindex_raises(): with pytest.raises(NotImplementedError): # dask_cudf does not support MultiIndex yet - dgd.from_cudf(df.set_index(["x", "y"])) + dask_cudf.from_cudf(df.set_index(["x", "y"])) def test_from_cudf_with_generic_idx(): @@ -122,7 +122,7 @@ def test_from_cudf_with_generic_idx(): } ) - ddf = dgd.from_cudf(cdf, npartitions=2) + ddf = dask_cudf.from_cudf(cdf, npartitions=2) assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) @@ -164,7 +164,7 @@ def test_query_local_dict(): {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) - ddf = dgd.from_cudf(gdf, npartitions=2) + ddf = dask_cudf.from_cudf(gdf, npartitions=2) val = 2 @@ -296,7 +296,7 @@ def test_set_index_sorted(): ddf1 = dd.from_pandas(df1, npartitions=2) gdf1 = cudf.from_pandas(df1) - gddf1 = dgd.from_cudf(gdf1, npartitions=2) + gddf1 = dask_cudf.from_cudf(gdf1, npartitions=2) expect = ddf1.set_index("id", sorted=True) got = gddf1.set_index("id", sorted=True) @@ -323,7 +323,9 @@ def test_rearrange_by_divisions(nelem, index): df["z"] = df["z"].astype("category") ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dgd.from_cudf(cudf.DataFrame.from_pandas(df), npartitions=4) + gdf1 = dask_cudf.from_cudf( + cudf.DataFrame.from_pandas(df), npartitions=4 + ) ddf1.index.name = index gdf1.index.name = index divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) @@ -487,7 +489,7 @@ def test_repartition_hash_staged(npartitions): # WARNING: Specific npartitions-max_branch combination # was specifically chosen to cover changes in #4676 npartitions_initial = 17 - ddf = dgd.from_cudf(gdf, npartitions=npartitions_initial) + ddf = dask_cudf.from_cudf(gdf, npartitions=npartitions_initial) ddf_new = ddf.shuffle( on=by, ignore_index=True, npartitions=npartitions, max_branch=4 ) @@ -527,7 +529,7 @@ def test_repartition_hash(by, npartitions, max_branch): } ) gdf.d = gdf.d.astype("datetime64[ms]") - ddf = dgd.from_cudf(gdf, npartitions=npartitions_i) + ddf = dask_cudf.from_cudf(gdf, npartitions=npartitions_i) ddf_new = ddf.shuffle( on=by, ignore_index=True, @@ -554,7 +556,7 @@ def test_repartition_hash(by, npartitions, max_branch): def test_repartition_no_extra_row(): # see https://github.com/rapidsai/cudf/issues/11930 gdf = cudf.DataFrame({"a": [10, 20, 30], "b": [1, 2, 3]}).set_index("a") - ddf = dgd.from_cudf(gdf, npartitions=1) + ddf = dask_cudf.from_cudf(gdf, npartitions=1) ddf_new = ddf.repartition([0, 5, 10, 30], force=True) dd.assert_eq(ddf, ddf_new) dd.assert_eq(gdf, ddf_new) @@ -669,20 +671,20 @@ def test_hash_object_dispatch(index): # DataFrame result = dd.core.hash_object_dispatch(obj, index=index) - expected = dgd.backends.hash_object_cudf(obj, index=index) + expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series result = dd.core.hash_object_dispatch(obj["x"], index=index) - expected = dgd.backends.hash_object_cudf(obj["x"], index=index) + expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) result = dd.core.hash_object_dispatch(obj_multi, index=index) - expected = dgd.backends.hash_object_cudf(obj_multi, index=index) + expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) @@ -729,7 +731,7 @@ def test_make_meta_backends(index): # Check dask code path if not MultiIndex if not isinstance(df.index, cudf.MultiIndex): - ddf = dgd.from_cudf(df, npartitions=1) + ddf = dask_cudf.from_cudf(df, npartitions=1) # Check "empty" metadata types dd.assert_eq(ddf._meta.dtypes, df.dtypes) @@ -751,7 +753,7 @@ def test_dataframe_series_replace(data): pdf = data.copy() gdf = cudf.from_pandas(pdf) - ddf = dgd.from_cudf(gdf, npartitions=5) + ddf = dask_cudf.from_cudf(gdf, npartitions=5) dd.assert_eq(ddf.replace(1, 2), pdf.replace(1, 2)) @@ -760,7 +762,7 @@ def test_dataframe_assign_col(): df = cudf.DataFrame(list(range(100))) pdf = pd.DataFrame(list(range(100))) - ddf = dgd.from_cudf(df, npartitions=4) + ddf = dask_cudf.from_cudf(df, npartitions=4) ddf["fold"] = 0 ddf["fold"] = ddf["fold"].map_partitions( lambda cudf_df: cp.random.randint(0, 4, len(cudf_df)) @@ -783,7 +785,7 @@ def test_dataframe_set_index(): pdf = df.to_pandas() with dask.config.set({"dataframe.convert-string": False}): - ddf = dgd.from_cudf(df, npartitions=4) + ddf = dask_cudf.from_cudf(df, npartitions=4) ddf = ddf.set_index("str") pddf = dd.from_pandas(pdf, npartitions=4) @@ -799,7 +801,7 @@ def test_series_describe(): sr = cudf.datasets.randomdata(20)["x"] psr = sr.to_pandas() - dsr = dgd.from_cudf(sr, npartitions=4) + dsr = dask_cudf.from_cudf(sr, npartitions=4) pdsr = dd.from_pandas(psr, npartitions=4) dd.assert_eq( @@ -814,7 +816,7 @@ def test_dataframe_describe(): df = cudf.datasets.randomdata(20) pdf = df.to_pandas() - ddf = dgd.from_cudf(df, npartitions=4) + ddf = dask_cudf.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) dd.assert_eq( @@ -831,7 +833,7 @@ def test_zero_std_describe(): } ) pdf = df.to_pandas() - ddf = dgd.from_cudf(df, npartitions=4) + ddf = dask_cudf.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) dd.assert_eq(ddf.describe(), pddf.describe(), rtol=1e-3) @@ -846,7 +848,7 @@ def test_large_numbers_var(): } ) pdf = df.to_pandas() - ddf = dgd.from_cudf(df, npartitions=4) + ddf = dask_cudf.from_cudf(df, npartitions=4) pddf = dd.from_pandas(pdf, npartitions=4) dd.assert_eq(ddf.var(), pddf.var(), rtol=1e-3) @@ -858,7 +860,7 @@ def test_index_map_partitions(): ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2) mins_pd = ddf.index.map_partitions(M.min, meta=ddf.index).compute() - gddf = dgd.from_cudf(cudf.DataFrame({"a": range(10)}), npartitions=2) + gddf = dask_cudf.from_cudf(cudf.DataFrame({"a": range(10)}), npartitions=2) mins_gd = gddf.index.map_partitions(M.min, meta=gddf.index).compute() dd.assert_eq(mins_pd, mins_gd) @@ -878,7 +880,7 @@ def test_merging_categorical_columns(): {"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]} ) - ddf_1 = dgd.from_cudf(df_1, npartitions=2) + ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) @@ -886,7 +888,7 @@ def test_merging_categorical_columns(): {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} ) - ddf_2 = dgd.from_cudf(df_2, npartitions=2) + ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) expected = cudf.DataFrame( @@ -930,7 +932,7 @@ def test_categorical_dtype_round_trip(): s = cudf.Series(4 * ["foo"], dtype="category") assert s.dtype.ordered is False - ds = dgd.from_cudf(s, npartitions=2) + ds = dask_cudf.from_cudf(s, npartitions=2) pds = dd.from_pandas(s.to_pandas(), npartitions=2) dd.assert_eq(ds, pds) assert ds.dtype.ordered is False diff --git a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py index 6c68d92a8df..e6fb58ad6df 100644 --- a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py +++ b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. """ Test IO with dask.delayed API @@ -10,14 +10,14 @@ from dask.delayed import delayed -import cudf as gd +import cudf -import dask_cudf as dgd +import dask_cudf @delayed def load_data(nelem, ident): - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.arange(nelem) df["ident"] = np.asarray([ident] * nelem) return df @@ -30,39 +30,39 @@ def get_combined_column(df): def test_dataframe_from_delayed(): delays = [load_data(10 * i, i) for i in range(1, 3)] - out = dgd.from_delayed(delays) + out = dask_cudf.from_delayed(delays) res = out.compute() - assert isinstance(res, gd.DataFrame) + assert isinstance(res, cudf.DataFrame) - expected = gd.concat([d.compute() for d in delays]) + expected = cudf.concat([d.compute() for d in delays]) assert_frame_equal(res.to_pandas(), expected.to_pandas()) def test_series_from_delayed(): delays = [get_combined_column(load_data(10 * i, i)) for i in range(1, 3)] - out = dgd.from_delayed(delays) + out = dask_cudf.from_delayed(delays) res = out.compute() - assert isinstance(res, gd.Series) + assert isinstance(res, cudf.Series) - expected = gd.concat([d.compute() for d in delays]) + expected = cudf.concat([d.compute() for d in delays]) np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas()) def test_dataframe_to_delayed(): nelem = 100 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.arange(nelem) df["y"] = np.random.randint(nelem, size=nelem) - ddf = dgd.from_cudf(df, npartitions=5) + ddf = dask_cudf.from_cudf(df, npartitions=5) delays = ddf.to_delayed() assert len(delays) == 5 # Concat the delayed partitions - got = gd.concat([d.compute() for d in delays]) + got = cudf.concat([d.compute() for d in delays]) assert_frame_equal(got.to_pandas(), df.to_pandas()) # Check individual partitions @@ -81,17 +81,17 @@ def test_dataframe_to_delayed(): def test_series_to_delayed(): nelem = 100 - sr = gd.Series(np.random.randint(nelem, size=nelem)) + sr = cudf.Series(np.random.randint(nelem, size=nelem)) - dsr = dgd.from_cudf(sr, npartitions=5) + dsr = dask_cudf.from_cudf(sr, npartitions=5) delays = dsr.to_delayed() assert len(delays) == 5 # Concat the delayed partitions - got = gd.concat([d.compute() for d in delays]) - assert isinstance(got, gd.Series) + got = cudf.concat([d.compute() for d in delays]) + assert isinstance(got, cudf.Series) np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas()) # Check individual partitions @@ -110,15 +110,15 @@ def test_series_to_delayed(): def test_mixing_series_frame_error(): nelem = 20 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.arange(nelem) df["y"] = np.random.randint(nelem, size=nelem) - ddf = dgd.from_cudf(df, npartitions=5) + ddf = dask_cudf.from_cudf(df, npartitions=5) delay_frame = ddf.to_delayed() delay_series = ddf.x.to_delayed() - combined = dgd.from_delayed(delay_frame + delay_series) + combined = dask_cudf.from_delayed(delay_frame + delay_series) with pytest.raises(ValueError) as raises: combined.compute() @@ -129,15 +129,15 @@ def test_mixing_series_frame_error(): def test_frame_extra_columns_error(): nelem = 20 - df = gd.DataFrame() + df = cudf.DataFrame() df["x"] = np.arange(nelem) df["y"] = np.random.randint(nelem, size=nelem) - ddf1 = dgd.from_cudf(df, npartitions=5) + ddf1 = dask_cudf.from_cudf(df, npartitions=5) df["z"] = np.arange(nelem) - ddf2 = dgd.from_cudf(df, npartitions=5) + ddf2 = dask_cudf.from_cudf(df, npartitions=5) - combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) + combined = dask_cudf.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) with pytest.raises(ValueError) as raises: combined.compute() @@ -150,18 +150,18 @@ def test_frame_extra_columns_error(): def test_frame_dtype_error(): nelem = 20 - df1 = gd.DataFrame() + df1 = cudf.DataFrame() df1["bad"] = np.arange(nelem) df1["bad"] = np.arange(nelem, dtype=np.float64) - df2 = gd.DataFrame() + df2 = cudf.DataFrame() df2["bad"] = np.arange(nelem) df2["bad"] = np.arange(nelem, dtype=np.float32) - ddf1 = dgd.from_cudf(df1, npartitions=5) - ddf2 = dgd.from_cudf(df2, npartitions=5) + ddf1 = dask_cudf.from_cudf(df1, npartitions=5) + ddf2 = dask_cudf.from_cudf(df2, npartitions=5) - combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) + combined = dask_cudf.from_delayed(ddf1.to_delayed() + ddf2.to_delayed()) with pytest.raises(ValueError) as raises: combined.compute() diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 73fd37df6fa..eb500ad2462 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from functools import partial @@ -10,7 +10,7 @@ import cudf -import dask_cudf as dgd +import dask_cudf param_nrows = [5, 10, 50, 100] @@ -44,8 +44,8 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys): expect = expect.to_pandas() # dask_cudf - left = dgd.from_cudf(left, chunksize=chunksize) - right = dgd.from_cudf(right, chunksize=chunksize) + left = dask_cudf.from_cudf(left, chunksize=chunksize) + right = dask_cudf.from_cudf(right, chunksize=chunksize) joined = left.set_index("x").join( right.set_index("x"), how="inner", lsuffix="l", rsuffix="r" @@ -102,8 +102,8 @@ def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how): expect = expect.to_pandas() # dask_cudf - left = dgd.from_cudf(left, chunksize=chunksize) - right = dgd.from_cudf(right, chunksize=chunksize) + left = dask_cudf.from_cudf(left, chunksize=chunksize) + right = dask_cudf.from_cudf(right, chunksize=chunksize) joined = left.set_index("x").join( right.set_index("x"), how=how, lsuffix="l", rsuffix="r" @@ -173,8 +173,8 @@ def normalize(df): ) # dask_cudf - left = dgd.from_cudf(left, chunksize=chunksize) - right = dgd.from_cudf(right, chunksize=chunksize) + left = dask_cudf.from_cudf(left, chunksize=chunksize) + right = dask_cudf.from_cudf(right, chunksize=chunksize) result = left.merge(right, on=("x", "y"), how=how).compute( scheduler="single-threaded" @@ -216,8 +216,8 @@ def test_merge_1col_left( ) # dask_cudf - left = dgd.from_cudf(left, chunksize=chunksize) - right = dgd.from_cudf(right, chunksize=chunksize) + left = dask_cudf.from_cudf(left, chunksize=chunksize) + right = dask_cudf.from_cudf(right, chunksize=chunksize) joined = left.merge(right, on=["x"], how=how) @@ -238,8 +238,8 @@ def test_merge_should_fail(): df2["a"] = [7, 2, 3, 8, 5, 9] * 2 df2["c"] = np.random.randint(0, 12, 12) - left = dgd.from_cudf(df1, 1).groupby("a").b.min().to_frame() - right = dgd.from_cudf(df2, 1).groupby("a").c.min().to_frame() + left = dask_cudf.from_cudf(df1, 1).groupby("a").b.min().to_frame() + right = dask_cudf.from_cudf(df2, 1).groupby("a").c.min().to_frame() with pytest.raises(KeyError): left.merge(right, how="left", on=["nonCol"]) @@ -250,7 +250,7 @@ def test_merge_should_fail(): # Same column names df2["b"] = np.random.randint(0, 12, 12) - right = dgd.from_cudf(df2, 1).groupby("a").b.min().to_frame() + right = dask_cudf.from_cudf(df2, 1).groupby("a").b.min().to_frame() with pytest.raises(KeyError): left.merge(right, how="left", on="NonCol") diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index e347e8be9e4..8688f830dcb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -9,7 +9,7 @@ import cudf -import dask_cudf as dgd +import dask_cudf def _make_random_frame(nelem, npartitions=2): @@ -20,7 +20,7 @@ def _make_random_frame(nelem, npartitions=2): } ) gdf = cudf.DataFrame.from_pandas(df) - dgf = dgd.from_cudf(gdf, npartitions=npartitions) + dgf = dask_cudf.from_cudf(gdf, npartitions=npartitions) return df, dgf @@ -67,7 +67,7 @@ def test_series_reduce(reducer): "op", ["max", "min", "sum", "prod", "mean", "var", "std"] ) def test_rowwise_reductions(data, op): - gddf = dgd.from_cudf(data, npartitions=10) + gddf = dask_cudf.from_cudf(data, npartitions=10) pddf = gddf.to_dask_dataframe() with dask.config.set({"dataframe.convert-string": False}):