From 67a6187e3ed74320ae3fcea6639a8d65fa3d8f01 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 24 Mar 2022 13:06:11 -0700 Subject: [PATCH 01/17] initial --- python/cudf/cudf/core/dataframe.py | 34 +++++++++++++++++++++++ python/cudf/cudf/tests/test_applymap.py | 37 ++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 95370cdeff7..9a20b42954d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -78,6 +78,7 @@ from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.core.udf.row_function import _get_row_kernel +from cudf.core.udf.scalar_function import _get_scalar_kernel from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -94,6 +95,8 @@ _external_only_api, ) +import numba + T = TypeVar("T", bound="DataFrame") @@ -3712,6 +3715,37 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + def applymap(self, func, na_action=None, **kwargs) -> DataFrame: + """ + Apply a function to a Dataframe elementwise. + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If ``ignore``, propagate NaN values, without passing them to func. + + Returns + ------- + DataFrame + Transformed DataFrame. + """ + + # TODO: naive implementation + # this could be written as a single kernel + + if na_action == 'ignore': + devfunc = numba.cuda.jit(device=True)(func) + func = lambda x: cudf.NA if x is cudf.NA else devfunc(x) + + result = DataFrame() + for col in self._data.keys(): + result[col] = self[col].apply(func) + return result + + @_cudf_nvtx_annotate @applyutils.doc_apply() def apply_rows( diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index ff6e79e7804..ff7662a37e9 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from cudf import Series +from cudf import Series, DataFrame, NA from cudf.testing import _utils as utils @@ -56,3 +56,38 @@ def test_applymap_change_out_dtype(): expect = np.array(data, dtype=float) got = out.to_numpy() np.testing.assert_array_equal(expect, got) + + +@pytest.mark.parametrize('data', [ + { + 'a': [1,2,3], + 'b': [4,5,6] + }, + { + 'a': [1,2,3], + 'b': [1.0, 2.0, 3.0] + }, + { + 'a': [1,2,3], + 'b': [True, False, True] + }, + { + 'a': [1, NA, 2], + 'b': [NA, 4, NA] + } +]) +@pytest.mark.parametrize('func', [ + lambda x: x + 1, + lambda x: x - 1, + lambda x: x + 0.5, + lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, + lambda x: 42 +]) +def test_applymap_dataframe(data, func): + gdf = DataFrame(data) + pdf = gdf.to_pandas(nullable=True) + + expect = pdf.applymap(func) + got = gdf.applymap(func) + + utils.assert_eq(expect, got, check_dtype=False) From 2871aa1061b8131262ffd08b4416c1194e6746c4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 29 Mar 2022 12:27:11 -0700 Subject: [PATCH 02/17] updates --- python/cudf/cudf/core/dataframe.py | 84 ++++++++++--------------- python/cudf/cudf/tests/test_applymap.py | 47 ++++++-------- 2 files changed, 53 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 47aa6386667..f5268fa1cc2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -25,6 +25,7 @@ ) import cupy +import numba import numpy as np import pandas as pd import pyarrow as pa @@ -78,7 +79,6 @@ from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.core.udf.row_function import _get_row_kernel -from cudf.core.udf.scalar_function import _get_scalar_kernel from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -95,8 +95,6 @@ _external_only_api, ) -import numba - T = TypeVar("T", bound="DataFrame") @@ -829,9 +827,7 @@ def _init_from_dict_like( masked = index is not None data = { key: cudf.core.column.column_empty( - row_count=row_count, - dtype=None, - masked=masked, + row_count=row_count, dtype=None, masked=masked, ) for key in extra_cols } @@ -860,10 +856,7 @@ def _init_from_dict_like( col_name, tuple ) self._insert( - i, - col_name, - data[col_name], - nan_as_null=nan_as_null, + i, col_name, data[col_name], nan_as_null=nan_as_null, ) if columns is not None: @@ -2103,9 +2096,7 @@ def _set_column_names(self, names, multiindex=False, level_names=None): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, - multiindex=multiindex, - level_names=level_names, + data, multiindex=multiindex, level_names=level_names, ) def _set_column_names_like(self, other): @@ -3380,13 +3371,7 @@ def merge( @_cudf_nvtx_annotate def join( - self, - other, - on=None, - how="left", - lsuffix="", - rsuffix="", - sort=False, + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -3733,18 +3718,33 @@ def applymap(self, func, na_action=None, **kwargs) -> DataFrame: Transformed DataFrame. """ - # TODO: naive implementation - # this could be written as a single kernel + if kwargs: + raise ValueError( + "DataFrame.applymap does not yet support **kwargs." + ) - if na_action == 'ignore': + if na_action == "ignore": devfunc = numba.cuda.jit(device=True)(func) - func = lambda x: cudf.NA if x is cudf.NA else devfunc(x) - result = DataFrame() - for col in self._data.keys(): - result[col] = self[col].apply(func) - return result + # promote to a null-ignoring function + def _func(x): + # promote to a null-ignoring function + if x is cudf.NA: + return cudf.NA + else: + return devfunc(x) + + else: + _func = func + + # TODO: naive implementation + # this could be written as a single kernel + result = {} + for name, col in self._data.items(): + apply_sr = Series._from_data({None: col}) + result[name] = apply_sr.apply(_func) + return DataFrame._from_data(result, index=self.index) @_cudf_nvtx_annotate @applyutils.doc_apply() @@ -4554,9 +4554,7 @@ def to_arrow(self, preserve_index=True): gen_names, self.index._data.names ): data._insert( - data.shape[1], - gen_name, - self.index._data[col_name], + data.shape[1], gen_name, self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5144,12 +5142,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): @_cudf_nvtx_annotate def _reduce( - self, - op, - axis=None, - level=None, - numeric_only=None, - **kwargs, + self, op, axis=None, level=None, numeric_only=None, **kwargs, ): if level is not None: raise NotImplementedError("level parameter is not implemented yet") @@ -5177,11 +5170,7 @@ def _reduce( @_cudf_nvtx_annotate def _scan( - self, - op, - axis=None, - *args, - **kwargs, + self, op, axis=None, *args, **kwargs, ): axis = self._get_axis_from_axis_arg(axis) @@ -5413,11 +5402,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series( - result, - index=self.index, - dtype=result_dtype, - ) + return Series(result, index=self.index, dtype=result_dtype,) else: result_df = DataFrame(result).set_index(self.index) result_df._set_column_names_like(prepared) @@ -6594,10 +6579,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - ._set_categories( - categories[idx], - is_unique=True, - ) + ._set_categories(categories[idx], is_unique=True,) .codes ) cols[idx] = cols[idx].astype(dtype) diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index ff7662a37e9..b450a78b1a1 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from cudf import Series, DataFrame, NA +from cudf import NA, DataFrame, Series from cudf.testing import _utils as utils @@ -58,36 +58,29 @@ def test_applymap_change_out_dtype(): np.testing.assert_array_equal(expect, got) -@pytest.mark.parametrize('data', [ - { - 'a': [1,2,3], - 'b': [4,5,6] - }, - { - 'a': [1,2,3], - 'b': [1.0, 2.0, 3.0] - }, - { - 'a': [1,2,3], - 'b': [True, False, True] - }, - { - 'a': [1, NA, 2], - 'b': [NA, 4, NA] - } -]) -@pytest.mark.parametrize('func', [ - lambda x: x + 1, - lambda x: x - 1, - lambda x: x + 0.5, - lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, - lambda x: 42 -]) +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [4, 5, 6]}, + {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]}, + {"a": [1, 2, 3], "b": [True, False, True]}, + {"a": [1, NA, 2], "b": [NA, 4, NA]}, + ], +) +@pytest.mark.parametrize( + "func", + [ + lambda x: x + 1, + lambda x: x - 1, + lambda x: x + 0.5, + lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, + lambda x: 42, + ], +) def test_applymap_dataframe(data, func): gdf = DataFrame(data) pdf = gdf.to_pandas(nullable=True) expect = pdf.applymap(func) got = gdf.applymap(func) - utils.assert_eq(expect, got, check_dtype=False) From b6827b541d3155ed0f2cea72861b5c4cafccb188 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 29 Mar 2022 12:30:57 -0700 Subject: [PATCH 03/17] add tests for na_action --- python/cudf/cudf/tests/test_applymap.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index b450a78b1a1..a51d0bb1c07 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -77,10 +77,12 @@ def test_applymap_change_out_dtype(): lambda x: 42, ], ) -def test_applymap_dataframe(data, func): +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_applymap_dataframe(data, func, na_action): gdf = DataFrame(data) pdf = gdf.to_pandas(nullable=True) - expect = pdf.applymap(func) - got = gdf.applymap(func) + expect = pdf.applymap(func, na_action=na_action) + got = gdf.applymap(func, na_action=na_action) + utils.assert_eq(expect, got, check_dtype=False) From 06348f791fea3e564e4b9c7c99576bcb645d00e7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 29 Mar 2022 13:45:10 -0700 Subject: [PATCH 04/17] match pandas error --- python/cudf/cudf/core/dataframe.py | 59 +++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f5268fa1cc2..f35a74596f1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,6 +13,7 @@ from collections.abc import Iterable, Sequence from typing import ( Any, + Callable, Dict, List, MutableMapping, @@ -827,7 +828,9 @@ def _init_from_dict_like( masked = index is not None data = { key: cudf.core.column.column_empty( - row_count=row_count, dtype=None, masked=masked, + row_count=row_count, + dtype=None, + masked=masked, ) for key in extra_cols } @@ -856,7 +859,10 @@ def _init_from_dict_like( col_name, tuple ) self._insert( - i, col_name, data[col_name], nan_as_null=nan_as_null, + i, + col_name, + data[col_name], + nan_as_null=nan_as_null, ) if columns is not None: @@ -2096,7 +2102,9 @@ def _set_column_names(self, names, multiindex=False, level_names=None): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, multiindex=multiindex, level_names=level_names, + data, + multiindex=multiindex, + level_names=level_names, ) def _set_column_names_like(self, other): @@ -3371,7 +3379,13 @@ def merge( @_cudf_nvtx_annotate def join( - self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, + self, + other, + on=None, + how="left", + lsuffix="", + rsuffix="", + sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -3700,7 +3714,9 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) - def applymap(self, func, na_action=None, **kwargs) -> DataFrame: + def applymap( + self, func: Callable, na_action: str | None = None, **kwargs + ) -> DataFrame: """ Apply a function to a Dataframe elementwise. This method applies a function that accepts and returns a scalar @@ -3723,6 +3739,11 @@ def applymap(self, func, na_action=None, **kwargs) -> DataFrame: "DataFrame.applymap does not yet support **kwargs." ) + if na_action not in {"ignore", None}: + raise ValueError( + f"na_action must be 'ignore' or None. Got {repr(na_action)}" + ) + if na_action == "ignore": devfunc = numba.cuda.jit(device=True)(func) @@ -4554,7 +4575,9 @@ def to_arrow(self, preserve_index=True): gen_names, self.index._data.names ): data._insert( - data.shape[1], gen_name, self.index._data[col_name], + data.shape[1], + gen_name, + self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5142,7 +5165,12 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): @_cudf_nvtx_annotate def _reduce( - self, op, axis=None, level=None, numeric_only=None, **kwargs, + self, + op, + axis=None, + level=None, + numeric_only=None, + **kwargs, ): if level is not None: raise NotImplementedError("level parameter is not implemented yet") @@ -5170,7 +5198,11 @@ def _reduce( @_cudf_nvtx_annotate def _scan( - self, op, axis=None, *args, **kwargs, + self, + op, + axis=None, + *args, + **kwargs, ): axis = self._get_axis_from_axis_arg(axis) @@ -5402,7 +5434,11 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series(result, index=self.index, dtype=result_dtype,) + return Series( + result, + index=self.index, + dtype=result_dtype, + ) else: result_df = DataFrame(result).set_index(self.index) result_df._set_column_names_like(prepared) @@ -6579,7 +6615,10 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - ._set_categories(categories[idx], is_unique=True,) + ._set_categories( + categories[idx], + is_unique=True, + ) .codes ) cols[idx] = cols[idx].astype(dtype) From 6fb742d3a8e14351a21017f55d69030e8f90f92c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 30 Mar 2022 09:29:28 -0700 Subject: [PATCH 05/17] add dask_cudf tests --- .../dask_cudf/tests/test_applymap.py | 30 +++++++++++++++ .../dask_cudf/dask_cudf/tests/test_binops.py | 34 ++--------------- python/dask_cudf/dask_cudf/tests/utils.py | 38 +++++++++++++++++++ 3 files changed, 71 insertions(+), 31 deletions(-) create mode 100644 python/dask_cudf/dask_cudf/tests/test_applymap.py create mode 100644 python/dask_cudf/dask_cudf/tests/utils.py diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py new file mode 100644 index 00000000000..1c075b8ddfa --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import pytest +from pandas import NA + +from dask import dataframe as dd + +from .utils import _make_random_frame + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x + 1, + lambda x: x - 1, + lambda x: x + 0.5, + lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, + lambda x: 42, + ], +) +@pytest.mark.parametrize("has_na", [True, False]) +def test_applymap_basic(func, has_na): + size = 2000 + pdf, dgdf = _make_random_frame(size, include_na=False) + # breakpoint() + dpdf = dd.from_pandas(pdf, npartitions=dgdf.npartitions) + + expect = dpdf.applymap(func) + got = dgdf.applymap(func) + dd.assert_eq(expect, got, check_dtype=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 64b7cc85971..09f14b029df 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -1,41 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + import operator import numpy as np -import pandas as pd import pytest from dask import dataframe as dd -import cudf - - -def _make_empty_frame(npartitions=2): - df = pd.DataFrame({"x": [], "y": []}) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dd.from_pandas(gdf, npartitions=npartitions) - return dgf - - -def _make_random_frame(nelem, npartitions=2): - df = pd.DataFrame( - {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} - ) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dd.from_pandas(gdf, npartitions=npartitions) - return df, dgf - - -def _make_random_frame_float(nelem, npartitions=2): - df = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, - } - ) - gdf = cudf.from_pandas(df) - dgf = dd.from_pandas(gdf, npartitions=npartitions) - return df, dgf - +from .utils import _make_random_frame, _make_random_frame_float _binops = [ operator.add, diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py new file mode 100644 index 00000000000..b8117cd5e62 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -0,0 +1,38 @@ +import numpy as np +import pandas as pd + +import dask.dataframe as dd + +import cudf + + +def _make_empty_frame(npartitions=2): + df = pd.DataFrame({"x": [], "y": []}) + gdf = cudf.DataFrame.from_pandas(df) + dgf = dd.from_pandas(gdf, npartitions=npartitions) + return dgf + + +def _make_random_frame(nelem, npartitions=2, include_na=False): + df = pd.DataFrame( + {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} + ) + + if include_na: + df["x"][::2] = pd.NA + + gdf = cudf.DataFrame.from_pandas(df) + dgf = dd.from_pandas(gdf, npartitions=npartitions) + return df, dgf + + +def _make_random_frame_float(nelem, npartitions=2): + df = pd.DataFrame( + { + "x": np.random.randint(0, 5, size=nelem), + "y": np.random.normal(size=nelem) + 1, + } + ) + gdf = cudf.from_pandas(df) + dgf = dd.from_pandas(gdf, npartitions=npartitions) + return df, dgf From 6ce83838a5b54b5fb578359e7a80e24584a32450 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 30 Mar 2022 09:30:48 -0700 Subject: [PATCH 06/17] copyright --- python/dask_cudf/dask_cudf/tests/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b8117cd5e62..549d5605652 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + import numpy as np import pandas as pd From 262c95861296d2ab315fd1c989df6d6c5f86ba09 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 30 Mar 2022 09:33:38 -0700 Subject: [PATCH 07/17] little cleanup --- python/dask_cudf/dask_cudf/tests/test_applymap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py index 1c075b8ddfa..f38726fc12f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_applymap.py +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -22,7 +22,7 @@ def test_applymap_basic(func, has_na): size = 2000 pdf, dgdf = _make_random_frame(size, include_na=False) - # breakpoint() + dpdf = dd.from_pandas(pdf, npartitions=dgdf.npartitions) expect = dpdf.applymap(func) From bd311abbeb0f676b26ca7b1b04a75980c436ba68 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 5 Apr 2022 13:40:17 -0700 Subject: [PATCH 08/17] address reviews --- python/cudf/cudf/core/dataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index eefecdf67a6..566ece73eb7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3723,16 +3723,18 @@ def apply( def applymap( self, func: Callable, na_action: str | None = None, **kwargs ) -> DataFrame: + """ Apply a function to a Dataframe elementwise. This method applies a function that accepts and returns a scalar to every element of a DataFrame. + Parameters ---------- func : callable Python function, returns a single value from a single value. na_action : {None, 'ignore'}, default None - If ``ignore``, propagate NaN values, without passing them to func. + If 'ignore', propagate NaN values, without passing them to func. Returns ------- @@ -3741,7 +3743,7 @@ def applymap( """ if kwargs: - raise ValueError( + raise NotImplementedError( "DataFrame.applymap does not yet support **kwargs." ) From db2fee98a8922edfddec4d72f34376e71a0a86a0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 6 Apr 2022 07:48:34 -0700 Subject: [PATCH 09/17] fix up type hints --- python/cudf/cudf/core/dataframe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 566ece73eb7..2d33efef44c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3721,7 +3721,10 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) def applymap( - self, func: Callable, na_action: str | None = None, **kwargs + self, + func: Callable[[Any], Any], + na_action: Union[str, None] = None, + **kwargs, ) -> DataFrame: """ From 7081276fc7bc226e187563882edf9d6212a22665 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 6 Apr 2022 14:08:25 -0700 Subject: [PATCH 10/17] respond to ci review.. --- python/cudf/cudf/core/dataframe.py | 61 +++++++------------------ python/cudf/cudf/tests/test_applymap.py | 13 ++++++ 2 files changed, 29 insertions(+), 45 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2d33efef44c..7713240e1b7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -828,9 +828,7 @@ def _init_from_dict_like( masked = index is not None data = { key: cudf.core.column.column_empty( - row_count=row_count, - dtype=None, - masked=masked, + row_count=row_count, dtype=None, masked=masked, ) for key in extra_cols } @@ -859,10 +857,7 @@ def _init_from_dict_like( col_name, tuple ) self._insert( - i, - col_name, - data[col_name], - nan_as_null=nan_as_null, + i, col_name, data[col_name], nan_as_null=nan_as_null, ) if columns is not None: @@ -1347,8 +1342,7 @@ def memory_usage(self, index=True, deep=False): mem_usage.append(self._index.memory_usage()) names.append("Index") return Series._from_data( - data={None: as_column(mem_usage)}, - index=as_index(names), + data={None: as_column(mem_usage)}, index=as_index(names), ) @_cudf_nvtx_annotate @@ -2108,9 +2102,7 @@ def _set_column_names(self, names, multiindex=False, level_names=None): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, - multiindex=multiindex, - level_names=level_names, + data, multiindex=multiindex, level_names=level_names, ) def _set_column_names_like(self, other): @@ -3385,13 +3377,7 @@ def merge( @_cudf_nvtx_annotate def join( - self, - other, - on=None, - how="left", - lsuffix="", - rsuffix="", - sort=False, + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -3759,11 +3745,14 @@ def applymap( devfunc = numba.cuda.jit(device=True)(func) # promote to a null-ignoring function + # this code is never run in python, it only + # exists to provide numba with the correct + # bytecode to generate the equivalent PTX + # as a null-ignoring version of the function def _func(x): - # promote to a null-ignoring function - if x is cudf.NA: + if x is cudf.NA: # pragma: no cover return cudf.NA - else: + else: # pragma: no cover return devfunc(x) else: @@ -4586,9 +4575,7 @@ def to_arrow(self, preserve_index=True): gen_names, self.index._data.names ): data._insert( - data.shape[1], - gen_name, - self.index._data[col_name], + data.shape[1], gen_name, self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5176,12 +5163,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): @_cudf_nvtx_annotate def _reduce( - self, - op, - axis=None, - level=None, - numeric_only=None, - **kwargs, + self, op, axis=None, level=None, numeric_only=None, **kwargs, ): if level is not None: raise NotImplementedError("level parameter is not implemented yet") @@ -5209,11 +5191,7 @@ def _reduce( @_cudf_nvtx_annotate def _scan( - self, - op, - axis=None, - *args, - **kwargs, + self, op, axis=None, *args, **kwargs, ): axis = self._get_axis_from_axis_arg(axis) @@ -5445,11 +5423,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series( - result, - index=self.index, - dtype=result_dtype, - ) + return Series(result, index=self.index, dtype=result_dtype,) else: result_df = DataFrame(result).set_index(self.index) result_df._set_column_names_like(prepared) @@ -6626,10 +6600,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - ._set_categories( - categories[idx], - is_unique=True, - ) + ._set_categories(categories[idx], is_unique=True,) .codes ) cols[idx] = cols[idx].astype(dtype) diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 5ad8cb98111..06025009b77 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -88,3 +88,16 @@ def test_applymap_dataframe(data, func, na_action): got = gdf.applymap(func, na_action=na_action) utils.assert_eq(expect, got, check_dtype=False) + + +def test_applymap_raise_cases(): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + def f(x, some_kwarg=0): + return x + some_kwarg + + with pytest.raises(NotImplementedError): + df.applymap(f, some_kwarg=1) + + with pytest.raises(ValueError): + df.applymap(f, na_action="some_invalid_option") From 7d7b30491c9391dcb9fd6541fbc3227914966466 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 6 Apr 2022 14:19:40 -0700 Subject: [PATCH 11/17] use black from the correct conda environment --- python/cudf/cudf/core/dataframe.py | 52 ++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7713240e1b7..d7f6c002ddf 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -828,7 +828,9 @@ def _init_from_dict_like( masked = index is not None data = { key: cudf.core.column.column_empty( - row_count=row_count, dtype=None, masked=masked, + row_count=row_count, + dtype=None, + masked=masked, ) for key in extra_cols } @@ -857,7 +859,10 @@ def _init_from_dict_like( col_name, tuple ) self._insert( - i, col_name, data[col_name], nan_as_null=nan_as_null, + i, + col_name, + data[col_name], + nan_as_null=nan_as_null, ) if columns is not None: @@ -1342,7 +1347,8 @@ def memory_usage(self, index=True, deep=False): mem_usage.append(self._index.memory_usage()) names.append("Index") return Series._from_data( - data={None: as_column(mem_usage)}, index=as_index(names), + data={None: as_column(mem_usage)}, + index=as_index(names), ) @_cudf_nvtx_annotate @@ -2102,7 +2108,9 @@ def _set_column_names(self, names, multiindex=False, level_names=None): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, multiindex=multiindex, level_names=level_names, + data, + multiindex=multiindex, + level_names=level_names, ) def _set_column_names_like(self, other): @@ -3377,7 +3385,13 @@ def merge( @_cudf_nvtx_annotate def join( - self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, + self, + other, + on=None, + how="left", + lsuffix="", + rsuffix="", + sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -4575,7 +4589,9 @@ def to_arrow(self, preserve_index=True): gen_names, self.index._data.names ): data._insert( - data.shape[1], gen_name, self.index._data[col_name], + data.shape[1], + gen_name, + self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -5163,7 +5179,12 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): @_cudf_nvtx_annotate def _reduce( - self, op, axis=None, level=None, numeric_only=None, **kwargs, + self, + op, + axis=None, + level=None, + numeric_only=None, + **kwargs, ): if level is not None: raise NotImplementedError("level parameter is not implemented yet") @@ -5191,7 +5212,11 @@ def _reduce( @_cudf_nvtx_annotate def _scan( - self, op, axis=None, *args, **kwargs, + self, + op, + axis=None, + *args, + **kwargs, ): axis = self._get_axis_from_axis_arg(axis) @@ -5423,7 +5448,11 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result = result.set_mask( cudf._lib.transform.bools_to_mask(mask._column) ) - return Series(result, index=self.index, dtype=result_dtype,) + return Series( + result, + index=self.index, + dtype=result_dtype, + ) else: result_df = DataFrame(result).set_index(self.index) result_df._set_column_names_like(prepared) @@ -6600,7 +6629,10 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - ._set_categories(categories[idx], is_unique=True,) + ._set_categories( + categories[idx], + is_unique=True, + ) .codes ) cols[idx] = cols[idx].astype(dtype) From 85963a8c9689ca76c31b515aa90f2cae2a60ede0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Apr 2022 15:35:44 -0500 Subject: [PATCH 12/17] Add blank link. --- python/cudf/cudf/core/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d7f6c002ddf..5e48cb3f816 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3729,6 +3729,7 @@ def applymap( """ Apply a function to a Dataframe elementwise. + This method applies a function that accepts and returns a scalar to every element of a DataFrame. From 6b91f33f6375747fd1a15a1796c8c7d8b898fb14 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 8 Apr 2022 10:57:03 -0500 Subject: [PATCH 13/17] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/tests/test_applymap.py | 3 +-- python/dask_cudf/dask_cudf/tests/test_applymap.py | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5e48cb3f816..a284dfdf940 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3764,10 +3764,10 @@ def applymap( # exists to provide numba with the correct # bytecode to generate the equivalent PTX # as a null-ignoring version of the function - def _func(x): - if x is cudf.NA: # pragma: no cover + def _func(x): # pragma: no cover + if x is cudf.NA: return cudf.NA - else: # pragma: no cover + else: return devfunc(x) else: diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 06025009b77..c8a9b5d03f5 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -73,8 +73,7 @@ def test_applymap_change_out_dtype(): "func", [ lambda x: x + 1, - lambda x: x - 1, - lambda x: x + 0.5, + lambda x: x - 0.5, lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, lambda x: 42, ], diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py index f38726fc12f..688c5591316 100644 --- a/python/dask_cudf/dask_cudf/tests/test_applymap.py +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -12,8 +12,7 @@ "func", [ lambda x: x + 1, - lambda x: x - 1, - lambda x: x + 0.5, + lambda x: x - 0.5, lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, lambda x: 42, ], From b344532ee59d5b1f4e2320f00e1b0f913562d4ae Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 8 Apr 2022 09:20:26 -0700 Subject: [PATCH 14/17] dont move all functions to utils yet --- .../dask_cudf/dask_cudf/tests/test_binops.py | 24 ++++++++++++++++++- python/dask_cudf/dask_cudf/tests/utils.py | 19 --------------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 09f14b029df..ae17dfc3206 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -3,11 +3,33 @@ import operator import numpy as np +import pandas as pd import pytest +from .utils import _make_random_frame from dask import dataframe as dd -from .utils import _make_random_frame, _make_random_frame_float +import cudf + + +def _make_empty_frame(npartitions=2): + df = pd.DataFrame({"x": [], "y": []}) + gdf = cudf.DataFrame.from_pandas(df) + dgf = dd.from_pandas(gdf, npartitions=npartitions) + return dgf + + +def _make_random_frame_float(nelem, npartitions=2): + df = pd.DataFrame( + { + "x": np.random.randint(0, 5, size=nelem), + "y": np.random.normal(size=nelem) + 1, + } + ) + gdf = cudf.from_pandas(df) + dgf = dd.from_pandas(gdf, npartitions=npartitions) + return df, dgf + _binops = [ operator.add, diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index 549d5605652..88a2116fb0a 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -8,13 +8,6 @@ import cudf -def _make_empty_frame(npartitions=2): - df = pd.DataFrame({"x": [], "y": []}) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dd.from_pandas(gdf, npartitions=npartitions) - return dgf - - def _make_random_frame(nelem, npartitions=2, include_na=False): df = pd.DataFrame( {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} @@ -26,15 +19,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -def _make_random_frame_float(nelem, npartitions=2): - df = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, - } - ) - gdf = cudf.from_pandas(df) - dgf = dd.from_pandas(gdf, npartitions=npartitions) - return df, dgf From d342f8e1fa80fd7893b9d1045cafb29d15351c23 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 8 Apr 2022 09:24:30 -0700 Subject: [PATCH 15/17] fix imports --- python/dask_cudf/dask_cudf/tests/test_binops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index ae17dfc3206..1c4d0de872e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -5,12 +5,13 @@ import numpy as np import pandas as pd import pytest -from .utils import _make_random_frame from dask import dataframe as dd import cudf +from .utils import _make_random_frame + def _make_empty_frame(npartitions=2): df = pd.DataFrame({"x": [], "y": []}) From 477a8245f3d02591138285939581dce32fc024a3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 13 Apr 2022 11:23:18 -0700 Subject: [PATCH 16/17] add applymap to docs --- docs/cudf/source/api_docs/dataframe.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 7a7c9c195b2..c9c67fc3108 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -107,6 +107,7 @@ Function application, GroupBy & window :toctree: api/ DataFrame.apply + DataFrame.applymap DataFrame.apply_chunks DataFrame.apply_rows DataFrame.pipe From e1d444c9cabc0e0380b951a017883fa381c174b6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 13 Apr 2022 12:50:32 -0700 Subject: [PATCH 17/17] use absolute imports --- python/dask_cudf/dask_cudf/tests/test_applymap.py | 2 +- python/dask_cudf/dask_cudf/tests/test_binops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py index 688c5591316..929f00ec296 100644 --- a/python/dask_cudf/dask_cudf/tests/test_applymap.py +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -5,7 +5,7 @@ from dask import dataframe as dd -from .utils import _make_random_frame +from dask_cudf.tests.utils import _make_random_frame @pytest.mark.parametrize( diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 1c4d0de872e..87bd401accd 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -10,7 +10,7 @@ import cudf -from .utils import _make_random_frame +from dask_cudf.tests.utils import _make_random_frame def _make_empty_frame(npartitions=2):