-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add cudf.DataFrame.applymap
#10542
Add cudf.DataFrame.applymap
#10542
Changes from 9 commits
67a6187
e087124
454d9d8
2871aa1
b6827b5
06348f7
6fb742d
6ce8383
262c958
1c5d7ad
bd311ab
db2fee9
7081276
7d7b304
85963a8
6b91f33
b344532
d342f8e
137604e
477a824
e1d444c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -13,6 +13,7 @@ | |||||
from collections.abc import Iterable, Sequence | ||||||
from typing import ( | ||||||
Any, | ||||||
Callable, | ||||||
Dict, | ||||||
List, | ||||||
MutableMapping, | ||||||
|
@@ -25,6 +26,7 @@ | |||||
) | ||||||
|
||||||
import cupy | ||||||
import numba | ||||||
import numpy as np | ||||||
import pandas as pd | ||||||
import pyarrow as pa | ||||||
|
@@ -3712,6 +3714,59 @@ def apply( | |||||
|
||||||
return self._apply(func, _get_row_kernel, *args, **kwargs) | ||||||
|
||||||
def applymap(
    self, func: Callable, na_action: str | None = None, **kwargs
) -> DataFrame:
    """
    Apply a function to a DataFrame elementwise.

    This method applies a function that accepts and returns a scalar
    to every element of a DataFrame.

    Parameters
    ----------
    func : callable
        Python function, returns a single value from a single value.
    na_action : {None, 'ignore'}, default None
        If 'ignore', propagate null (NA) values, without passing them
        to func.

    Returns
    -------
    DataFrame
        Transformed DataFrame.

    Raises
    ------
    ValueError
        If any ``**kwargs`` are passed, or if ``na_action`` is neither
        None nor 'ignore'.
    """

    if kwargs:
        # NOTE(review): a reviewer questioned the exception type raised
        # here (the suggested type is not visible); ValueError is kept so
        # that existing callers are unaffected.
        raise ValueError(
            "DataFrame.applymap does not yet support **kwargs."
        )

    # Membership is tested against a tuple rather than a set so that an
    # unhashable ``na_action`` (e.g. a list) is rejected with the
    # ValueError below instead of a TypeError raised while hashing.
    if na_action not in ("ignore", None):
        raise ValueError(
            f"na_action must be 'ignore' or None. Got {na_action!r}"
        )

    if na_action == "ignore":
        devfunc = numba.cuda.jit(device=True)(func)

        # promote to a null-ignoring function
        def _func(x):
            if x is cudf.NA:
                return cudf.NA
            else:
                return devfunc(x)

    else:
        _func = func

    # TODO: naive implementation
    # this could be written as a single kernel
    result = {}
    for name, col in self._data.items():
        # Wrap each column in an unnamed Series and reuse Series.apply.
        apply_sr = Series._from_data({None: col})
        result[name] = apply_sr.apply(_func)

    return DataFrame._from_data(result, index=self.index)
|
||||||
@_cudf_nvtx_annotate | ||||||
@applyutils.doc_apply() | ||||||
def apply_rows( | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
|
||
import pytest | ||
from pandas import NA | ||
|
||
from dask import dataframe as dd | ||
|
||
from .utils import _make_random_frame | ||
Review comment: Can we use an absolute import here instead of a relative import, so that it is consistent with the other imports here and elsewhere in the code base? |
||
|
||
|
||
@pytest.mark.parametrize(
    "func",
    [
        lambda x: x + 1,
        lambda x: x - 1,
        lambda x: x + 0.5,
        lambda x: 2 if x is NA else 2 + (x + 1) / 4.1,
        lambda x: 42,
    ],
)
@pytest.mark.parametrize("has_na", [True, False])
def test_applymap_basic(func, has_na):
    """Check that ``applymap`` agrees between the two dask collections.

    The dask collection returned by ``_make_random_frame`` is compared
    against a dask collection built from the matching pandas frame, using
    the same number of partitions and ignoring dtype differences.
    """
    # NOTE(review): ``has_na`` is parametrized but never forwarded; the
    # frame is always built with ``include_na=False``, so both parameter
    # values exercise the same kind of data. Behavior is left unchanged
    # here -- confirm whether NA coverage was intended.
    pdf, dgdf = _make_random_frame(2000, include_na=False)
    dpdf = dd.from_pandas(pdf, npartitions=dgdf.npartitions)

    dd.assert_eq(
        dpdf.applymap(func), dgdf.applymap(func), check_dtype=False
    )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION. | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
import dask.dataframe as dd | ||
|
||
import cudf | ||
|
||
|
||
def _make_empty_frame(npartitions=2):
    """Build an empty dask collection with columns ``x`` and ``y``.

    An empty pandas frame is converted with ``cudf.DataFrame.from_pandas``
    and then partitioned with ``dd.from_pandas``. Only the dask collection
    is returned.
    """
    empty_pdf = pd.DataFrame({"x": [], "y": []})
    empty_gdf = cudf.DataFrame.from_pandas(empty_pdf)
    return dd.from_pandas(empty_gdf, npartitions=npartitions)
Review comment: Seems strange that this only returns `dgf`. Reply: Sorry, I probably shouldn't have moved this function in the first place, since it is consumed elsewhere and not actually used in my tests. I have moved it back for now. |
||
|
||
|
||
def _make_random_frame(nelem, npartitions=2, include_na=False):
    """Build a random two-column frame and its dask counterpart.

    Parameters
    ----------
    nelem : int
        Number of rows; columns ``x`` and ``y`` are filled from
        ``np.random.random``.
    npartitions : int, default 2
        Number of partitions passed to ``dd.from_pandas``.
    include_na : bool, default False
        If True, every second row of ``x`` (starting at row 0) is set to
        ``pd.NA`` before conversion.

    Returns
    -------
    tuple
        ``(df, dgf)``: the pandas frame and the dask collection built from
        its ``cudf.DataFrame.from_pandas`` conversion.
    """
    df = pd.DataFrame(
        {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)}
    )

    if include_na:
        # One ``.loc`` assignment instead of the chained
        # ``df["x"][::2] = ...``, which may write to a temporary copy
        # (SettingWithCopyWarning; a no-op under pandas copy-on-write).
        df.loc[::2, "x"] = pd.NA

    gdf = cudf.DataFrame.from_pandas(df)
    dgf = dd.from_pandas(gdf, npartitions=npartitions)
    return df, dgf
|
||
|
||
def _make_random_frame_float(nelem, npartitions=2):
    """Build a random frame with integer ``x`` and float ``y`` columns.

    ``x`` is drawn with ``np.random.randint(0, 5)`` and ``y`` is a standard
    normal sample shifted by 1. Returns ``(df, dgf)``: the pandas frame and
    the dask collection built from its ``cudf.from_pandas`` conversion.
    """
    columns = {
        "x": np.random.randint(0, 5, size=nelem),
        "y": np.random.normal(size=nelem) + 1,
    }
    df = pd.DataFrame(columns)
    dgf = dd.from_pandas(cudf.from_pandas(df), npartitions=npartitions)
    return df, dgf
Review comment: Could you also add this entry to this section of the docs: https://github.com/rapidsai/cudf/blob/branch-22.06/docs/cudf/source/api_docs/dataframe.rst#function-application-groupby--window