Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] DataFrame insert and creation optimizations #10285

Merged
merged 11 commits into from
Feb 16, 2022
52 changes: 41 additions & 11 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,8 +831,8 @@ def _init_from_dict_like(
self._data.multiindex = self._data.multiindex and isinstance(
col_name, tuple
)
self.insert(
i, col_name, data[col_name], nan_as_null=nan_as_null
self._insert(
i, col_name, data[col_name], nan_as_null=nan_as_null,
)

if columns is not None:
Expand Down Expand Up @@ -1093,7 +1093,7 @@ def __setitem__(self, arg, value):
)
else:
for col_name in self._data:
scatter_map = arg[col_name]
scatter_map = arg._data[col_name]
if is_scalar(value):
self._data[col_name][scatter_map] = value
else:
Expand Down Expand Up @@ -2571,6 +2571,29 @@ def insert(self, loc, name, value, nan_as_null=None):
name : number or string
name or label of column to be inserted
value : Series or array-like
nan_as_null : bool, Default None
If ``None``/``True``, converts ``np.nan`` values to
``null`` values.
If ``False``, leaves ``np.nan`` values as is.
"""
return self._insert(
loc=loc,
name=name,
value=value,
nan_as_null=nan_as_null,
ignore_index=False,
)

@annotate("DATAFRAME__INSERT", color="green", domain="cudf_python")
def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
"""
Same as `insert`, with additional `ignore_index` param.

ignore_index : bool, default True
If True, there will be no index equality check & reindexing
happening.
If False, a reindexing operation is performed if
`value.index` is not equal to `self.index`.
"""
if name in self._data:
raise NameError(f"duplicated column name {name}")
Expand All @@ -2591,7 +2614,8 @@ def insert(self, loc, name, value, nan_as_null=None):

if len(self) == 0:
if isinstance(value, (pd.Series, Series)):
self._index = as_index(value.index)
if not ignore_index:
self._index = as_index(value.index)
elif len(value) > 0:
self._index = RangeIndex(start=0, stop=len(value))
new_data = self._data.__class__()
Expand All @@ -2604,9 +2628,11 @@ def insert(self, loc, name, value, nan_as_null=None):
)
self._data = new_data
elif isinstance(value, (pd.Series, Series)):
value = Series(value, nan_as_null=nan_as_null)._align_to_index(
self._index, how="right", sort=False
)
value = Series(value, nan_as_null=nan_as_null)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's probably a faster way to construct this if we know the input is a cudf.Series. I'm not sure how much we could save for a pandas Series by handling it manually; I don't think we typically do anything special for those. That may be worth exploring in a future PR (basically seeing if we can implement Series.from_pandas more efficiently than just calling the constructor), but it is out of scope for now.

Copy link
Contributor Author

@galipremsagar galipremsagar Feb 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think it is out of scope for this PR. We have other places in the codebase that use similar patterns, so we might be better off tackling them all at once in a separate PR.

if not ignore_index:
value = value._align_to_index(
self._index, how="right", sort=False
)

value = column.as_column(value, nan_as_null=nan_as_null)

Expand Down Expand Up @@ -4731,8 +4757,8 @@ def to_arrow(self, preserve_index=True):
for gen_name, col_name in zip(
gen_names, self.index._data.names
):
data.insert(
data.shape[1], gen_name, self.index._data[col_name]
data._insert(
data.shape[1], gen_name, self.index._data[col_name],
)
descr = gen_names[0]
index_descr.append(descr)
Expand Down Expand Up @@ -5725,7 +5751,7 @@ def select_dtypes(self, include=None, exclude=None):
for k, col in self._data.items():
infered_type = cudf_dtype_from_pydata_dtype(col.dtype)
if infered_type in inclusion:
df.insert(len(df._data), k, col)
df._insert(len(df._data), k, col)

return df

Expand Down Expand Up @@ -6540,7 +6566,11 @@ def _setitem_with_dataframe(
raise ValueError("Can not insert new column with a bool mask")
else:
# handle append case
input_df.insert(len(input_df._data), col_1, replace_df[col_2])
input_df._insert(
loc=len(input_df._data),
name=col_1,
value=replace_df[col_2],
)


def extract_col(df, col):
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6806,10 +6806,9 @@ def _drop_rows_by_labels(
join_res = working_df.join(to_join, how="leftanti")

# 4. Reconstruct original layout, and rename
join_res.insert(
join_res._insert(
ilevel, name=join_res._index.name, value=join_res._index
)
join_res = join_res.reset_index(drop=True)

midx = cudf.MultiIndex.from_frame(
join_res.iloc[:, 0:idx_nlv], names=obj._index.names
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def agg(self, func):

if not self._as_index:
for col_name in reversed(self.grouping._named_columns):
result.insert(
result._insert(
0,
col_name,
result.index.get_level_values(col_name)._values,
Expand Down
10 changes: 1 addition & 9 deletions python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,7 @@ def scalar_broadcast_to(scalar, size, dtype=None):
scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
dtype = scalar.dtype

if cudf.dtype(dtype).kind in ("O", "U"):
gather_map = column.full(size, 0, dtype="int32")
scalar_str_col = column.as_column([scalar], dtype="str")
return scalar_str_col[gather_map]
else:
out_col = column.column_empty(size, dtype=dtype)
if out_col.size != 0:
out_col.data_array_view[:] = scalar
return out_col
return cudf.core.column.full(size=size, fill_value=scalar, dtype=dtype)


def initfunc(f):
Expand Down