Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #3326 DataFrame.assign #3327

Merged
merged 1 commit into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions PROTO_tests/tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1381,6 +1381,33 @@ def get_tail_values(col):
df.to_pandas(retain_index=True).groupby("a").tail(n=2),
)

def test_assign(self):
    """Check ``DataFrame.assign`` against ``pandas.DataFrame.assign``.

    Three cases are compared: a callable using attribute-style column
    access, a precomputed (non-callable) column, and two callables where
    the second reads the column created by the first.
    """
    ak_df = ak.DataFrame(
        {"temp_c": ak.array([17.0, 25.0])},
        index=ak.array(["Portland", "Berkeley"]),
    )
    pd_df = ak_df.to_pandas()

    def fahrenheit_by_attr(frame):
        return frame.temp_c * 9 / 5 + 32

    def fahrenheit_by_key(frame):
        return frame["temp_c"] * 9 / 5 + 32

    def kelvin_from_fahrenheit(frame):
        return (frame["temp_f"] + 459.67) * 5 / 9

    # Callable value, attribute-style access to the source column.
    ak_result = ak_df.assign(temp_f=fahrenheit_by_attr).to_pandas()
    pd_expected = pd_df.assign(temp_f=fahrenheit_by_attr)
    assert_frame_equal(ak_result, pd_expected)

    # Non-callable value computed up front from an existing column.
    ak_result = ak_df.assign(temp_f=ak_df["temp_c"] * 9 / 5 + 32).to_pandas()
    pd_expected = pd_df.assign(temp_f=pd_df["temp_c"] * 9 / 5 + 32)
    assert_frame_equal(ak_result, pd_expected)

    # Keyword order matters: temp_k is derived from the freshly added temp_f.
    chained = {"temp_f": fahrenheit_by_key, "temp_k": kelvin_from_fahrenheit}
    ak_result = ak_df.assign(**chained).to_pandas()
    pd_expected = pd_df.assign(**chained)
    assert_frame_equal(ak_result, pd_expected)


def pda_to_str_helper(pda):
    """Build ``ak.array`` of ``"str <value>"`` labels, one per element of ``pda``."""
    values = pda.to_list()
    labels = [f"str {i}" for i in values]
    return ak.array(labels)
88 changes: 88 additions & 0 deletions arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,23 @@
]


def apply_if_callable(maybe_callable, obj, **kwargs):
    """
    Evaluate possibly callable input using obj and kwargs if it is callable,
    otherwise return it unchanged.

    Parameters
    ----------
    maybe_callable : callable or any other object
        Value to resolve. When callable, it is invoked as
        ``maybe_callable(obj, **kwargs)``.
    obj : object
        First positional argument passed to ``maybe_callable``.
    **kwargs
        Extra keyword arguments forwarded to ``maybe_callable``.

    Returns
    -------
    object
        The result of the call when ``maybe_callable`` is callable,
        otherwise ``maybe_callable`` itself.
    """
    if callable(maybe_callable):
        return maybe_callable(obj, **kwargs)

    return maybe_callable


def groupby_operators(cls):
for name in GROUPBY_REDUCTION_TYPES:
setattr(cls, name, cls._make_aggop(name))
Expand Down Expand Up @@ -1073,6 +1090,9 @@ def __setitem__(self, key, value):

# Set a single column in the dataframe using an arkouda array
elif isinstance(key, str):
if isinstance(value, Series):
value = value.values

if not isinstance(value, self._COLUMN_CLASSES):
raise ValueError(f"Column must be one of {self._COLUMN_CLASSES}.")
elif self._nrows is not None and self._nrows != value.size:
Expand Down Expand Up @@ -5495,6 +5515,74 @@ def from_return_msg(cls, rep_msg):

return cls(columns, idx)

def assign(self, **kwargs) -> DataFrame:
    r"""
    Assign new columns to a DataFrame.

    Returns a new object with all original columns in addition to new ones.
    Existing columns that are re-assigned will be overwritten.

    Parameters
    ----------
    **kwargs : dict of {str: callable or Series}
        The column names are keywords. If the values are
        callable, they are computed on the DataFrame and
        assigned to the new columns. The callable must not
        change the input DataFrame (this is not checked).
        If the values are not callable (e.g. a Series or an array),
        they are assigned as-is through item assignment, so they must
        be values that ``DataFrame.__setitem__`` accepts as a column.

    Returns
    -------
    DataFrame
        A new DataFrame with the new columns in addition to
        all the existing columns.

    Raises
    ------
    ValueError
        Propagated from column assignment when a value is not one of
        the supported column classes.

    Notes
    -----
    Assigning multiple columns within the same ``assign`` is possible.
    Later items in '\*\*kwargs' may refer to newly created or modified
    columns in 'df'; items are computed and assigned into 'df' in order.

    Examples
    --------
    >>> df = ak.DataFrame({'temp_c': [17.0, 25.0]},
    ...                   index=['Portland', 'Berkeley'])
    >>> df
              temp_c
    Portland    17.0
    Berkeley    25.0

    Where the value is a callable, evaluated on `df`:

    >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
              temp_c  temp_f
    Portland    17.0    62.6
    Berkeley    25.0    77.0

    Alternatively, the same behavior can be achieved by directly
    referencing an existing Series or sequence:

    >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
              temp_c  temp_f
    Portland    17.0    62.6
    Berkeley    25.0    77.0

    You can create multiple columns within the same assign where one
    of the columns depends on another one defined within the same assign:

    >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
    ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
              temp_c  temp_f  temp_k
    Portland    17.0    62.6  290.15
    Berkeley    25.0    77.0  298.15
    """
    # Work on a copy so the new columns are added to the returned frame.
    # NOTE(review): ``deep=None`` mirrors the pandas implementation; confirm
    # how ``DataFrame.copy`` treats a non-bool ``deep`` and that the copy
    # does not share its column container with ``self``.
    data = self.copy(deep=None)

    # Resolve each value against the frame built so far, so later keywords
    # can read columns created by earlier ones.
    for k, v in kwargs.items():
        data[k] = apply_if_callable(v, data)
    return data


def intx(a, b):
"""
Expand Down
Loading