Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cudf.DataFrame.applymap #10542

Merged
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from collections.abc import Iterable, Sequence
from typing import (
Any,
Callable,
Dict,
List,
MutableMapping,
Expand All @@ -25,6 +26,7 @@
)

import cupy
import numba
import numpy as np
import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -3718,6 +3720,68 @@ def apply(

return self._apply(func, _get_row_kernel, *args, **kwargs)

def applymap(
    self,
    func: Callable[[Any], Any],
    na_action: Union[str, None] = None,
    **kwargs,
) -> DataFrame:
    """
    Apply a scalar function to each element of the DataFrame.

    ``func`` is applied one column at a time through ``Series.apply``
    and the per-column results are assembled into a new DataFrame that
    reuses this frame's index.

    Parameters
    ----------
    func : callable
        Python function, returns a single value from a single value.
    na_action : {None, 'ignore'}, default None
        If 'ignore', null elements are returned as null without being
        passed to ``func``.
    **kwargs
        Not supported yet; supplying any keyword argument raises.

    Returns
    -------
    DataFrame
        Transformed DataFrame.

    Raises
    ------
    NotImplementedError
        If any ``**kwargs`` are supplied.
    ValueError
        If ``na_action`` is neither ``'ignore'`` nor ``None``.
    """
    # Reject unsupported arguments before doing any work.
    if kwargs:
        raise NotImplementedError(
            "DataFrame.applymap does not yet support **kwargs."
        )

    if na_action not in {"ignore", None}:
        raise ValueError(
            f"na_action must be 'ignore' or None. Got {repr(na_action)}"
        )

    if na_action == "ignore":
        devfunc = numba.cuda.jit(device=True)(func)

        # Null-skipping wrapper around ``func``. It is never executed
        # by the Python interpreter; it exists so numba can read its
        # bytecode and generate PTX for a null-ignoring version of
        # the user function.
        def _func(x):
            if x is cudf.NA:  # pragma: no cover
                return cudf.NA
            else:  # pragma: no cover
                return devfunc(x)

    else:
        _func = func

    # TODO: naive implementation
    # this could be written as a single kernel
    transformed = {
        name: Series._from_data({None: col}).apply(_func)
        for name, col in self._data.items()
    }
    return DataFrame._from_data(transformed, index=self.index)

@_cudf_nvtx_annotate
@applyutils.doc_apply()
def apply_rows(
Expand Down
45 changes: 44 additions & 1 deletion python/cudf/cudf/tests/test_applymap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import numpy as np
import pytest

from cudf import Series
from cudf import NA, DataFrame, Series
from cudf.testing import _utils as utils


Expand Down Expand Up @@ -58,3 +58,46 @@ def test_applymap_change_out_dtype():
expect = np.array(data, dtype=float)
got = out.to_numpy()
np.testing.assert_array_equal(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        {"a": [1, 2, 3], "b": [True, False, True]},
        {"a": [1, NA, 2], "b": [NA, 4, NA]},
    ],
)
@pytest.mark.parametrize(
    "func",
    [
        lambda x: x + 1,
        lambda x: x - 1,
        lambda x: x + 0.5,
        lambda x: 2 if x is NA else 2 + (x + 1) / 4.1,
        lambda x: 42,
    ],
)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_applymap_dataframe(data, func, na_action):
    """Compare ``DataFrame.applymap`` with the pandas result.

    Runs every combination of input data, element-wise function and
    ``na_action``; dtypes are not compared.
    """
    gpu_frame = DataFrame(data)

    # Reference result from pandas, computed on a nullable-dtype copy
    # of the same data.
    expect = gpu_frame.to_pandas(nullable=True).applymap(
        func, na_action=na_action
    )
    got = gpu_frame.applymap(func, na_action=na_action)

    utils.assert_eq(expect, got, check_dtype=False)


def test_applymap_raise_cases():
    """Check that unsupported ``applymap`` arguments raise."""
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    def add_offset(x, some_kwarg=0):
        return x + some_kwarg

    # Forwarding extra keyword arguments is not implemented.
    with pytest.raises(NotImplementedError):
        frame.applymap(add_offset, some_kwarg=1)

    # Only None and 'ignore' are accepted for na_action.
    with pytest.raises(ValueError):
        frame.applymap(add_offset, na_action="some_invalid_option")
30 changes: 30 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_applymap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

import pytest
from pandas import NA

from dask import dataframe as dd

from .utils import _make_random_frame
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do an absolute import here instead of a relative import so that it is consistent with other imports here and elsewhere in the code-base?



@pytest.mark.parametrize(
    "func",
    [
        lambda x: x + 1,
        lambda x: x - 1,
        lambda x: x + 0.5,
        lambda x: 2 if x is NA else 2 + (x + 1) / 4.1,
        lambda x: 42,
    ],
)
@pytest.mark.parametrize("has_na", [True, False])
def test_applymap_basic(func, has_na):
    """Compare dask_cudf ``applymap`` with dask-on-pandas ``applymap``.

    dtypes are not compared.
    """
    # NOTE(review): ``has_na`` is parametrized but never read; the frame
    # is always built with include_na=False, so both parametrizations
    # exercise identical data. Confirm whether include_na=has_na was
    # intended before wiring it through.
    nrows = 2000
    host_frame, gpu_ddf = _make_random_frame(nrows, include_na=False)

    # Partition the pandas frame the same way as the GPU-backed one.
    host_ddf = dd.from_pandas(host_frame, npartitions=gpu_ddf.npartitions)

    expect = host_ddf.applymap(func)
    got = gpu_ddf.applymap(func)
    dd.assert_eq(expect, got, check_dtype=False)
34 changes: 3 additions & 31 deletions python/dask_cudf/dask_cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,13 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

import operator

import numpy as np
import pandas as pd
import pytest

from dask import dataframe as dd

import cudf


def _make_empty_frame(npartitions=2):
    """Return a dask collection over an empty cudf frame with columns x, y."""
    empty_pdf = pd.DataFrame({name: [] for name in ("x", "y")})
    empty_gdf = cudf.DataFrame.from_pandas(empty_pdf)
    return dd.from_pandas(empty_gdf, npartitions=npartitions)


def _make_random_frame(nelem, npartitions=2):
    """Build a random pandas frame and a dask collection over its cudf copy.

    Both columns ``x`` and ``y`` hold ``nelem`` values drawn with
    ``np.random.random``. Returns the pair ``(pandas_frame, dask_frame)``.
    """
    # Draw x before y so the random stream is consumed in a fixed order.
    x_values = np.random.random(size=nelem)
    y_values = np.random.random(size=nelem)
    pdf = pd.DataFrame({"x": x_values, "y": y_values})

    dask_frame = dd.from_pandas(
        cudf.DataFrame.from_pandas(pdf), npartitions=npartitions
    )
    return pdf, dask_frame


def _make_random_frame_float(nelem, npartitions=2):
    """Build a random mixed frame and a dask collection over its cudf copy.

    Column ``x`` holds integers drawn with ``np.random.randint(0, 5)``;
    column ``y`` holds normal samples shifted by 1. Returns the pair
    ``(pandas_frame, dask_frame)``.
    """
    # Draw x before y so the random stream is consumed in a fixed order.
    x_values = np.random.randint(0, 5, size=nelem)
    y_values = np.random.normal(size=nelem) + 1
    pdf = pd.DataFrame({"x": x_values, "y": y_values})

    dask_frame = dd.from_pandas(
        cudf.from_pandas(pdf), npartitions=npartitions
    )
    return pdf, dask_frame

from .utils import _make_random_frame, _make_random_frame_float

_binops = [
operator.add,
Expand Down
40 changes: 40 additions & 0 deletions python/dask_cudf/dask_cudf/tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright (c) 2022, NVIDIA CORPORATION.

import numpy as np
import pandas as pd

import dask.dataframe as dd

import cudf


def _make_empty_frame(npartitions=2):
    """Return a dask collection over an empty cudf frame with columns x, y.

    Only the dask collection is returned; no pandas frame accompanies it.
    """
    empty_pdf = pd.DataFrame({name: [] for name in ("x", "y")})
    empty_gdf = cudf.DataFrame.from_pandas(empty_pdf)
    return dd.from_pandas(empty_gdf, npartitions=npartitions)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems strange that this only returns dgf while _make_random_frame and _make_random_frame_float return df, dgf. Should we symmetrize this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I probably shouldn't have moved this function in the first place since it's being consumed elsewhere and not actually used in my tests. I just moved it back for now.



def _make_random_frame(nelem, npartitions=2, include_na=False):
    """Build a random pandas frame and a dask collection over its cudf copy.

    Parameters
    ----------
    nelem : int
        Number of rows; columns ``x`` and ``y`` are each filled with
        ``np.random.random(size=nelem)``.
    npartitions : int, default 2
        Passed to ``dd.from_pandas`` when wrapping the cudf frame.
    include_na : bool, default False
        If True, every second row of ``x`` (starting at row 0) is set
        to ``pd.NA`` before the frame is converted.

    Returns
    -------
    tuple
        ``(df, dgf)``: the pandas frame and the dask collection built
        from its cudf conversion.
    """
    df = pd.DataFrame(
        {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)}
    )

    if include_na:
        # Assign through a single .loc call. The chained form
        # ``df["x"][::2] = ...`` writes to a temporary object under
        # pandas copy-on-write and would leave ``df`` unchanged.
        df.loc[::2, "x"] = pd.NA

    gdf = cudf.DataFrame.from_pandas(df)
    dgf = dd.from_pandas(gdf, npartitions=npartitions)
    return df, dgf


def _make_random_frame_float(nelem, npartitions=2):
    """Build a random mixed frame and a dask collection over its cudf copy.

    Column ``x`` holds integers drawn with ``np.random.randint(0, 5)``;
    column ``y`` holds normal samples shifted by 1. Returns the pair
    ``(pandas_frame, dask_frame)``.
    """
    # Draw x before y so the random stream is consumed in a fixed order.
    x_values = np.random.randint(0, 5, size=nelem)
    y_values = np.random.normal(size=nelem) + 1
    pdf = pd.DataFrame({"x": x_values, "y": y_values})

    dask_frame = dd.from_pandas(
        cudf.from_pandas(pdf), npartitions=npartitions
    )
    return pdf, dask_frame