Use as_column instead of full #14698

Merged
28 commits, merged Mar 5, 2024

Commits
3016905
Refactor scalar handling in as_column
mroeschke Dec 12, 2023
575dcf3
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Dec 13, 2023
6d2a310
Add exception for Interval
mroeschke Dec 13, 2023
23544ea
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Dec 16, 2023
2a83e16
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Dec 19, 2023
9b701bf
lint
mroeschke Dec 19, 2023
5e7cb98
Add return
mroeschke Dec 19, 2023
d000e90
Merge remote-tracking branch 'upstream/branch-24.02' into ref/column/…
mroeschke Jan 3, 2024
3c87fa8
Use as_column instead of full
mroeschke Jan 3, 2024
d7fd78f
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 4, 2024
0811adc
Fix typo
mroeschke Jan 4, 2024
779c369
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 5, 2024
30e77a2
Unpack 0D arrays
mroeschke Jan 5, 2024
ea335e4
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 5, 2024
b179069
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 12, 2024
2bc69d2
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 22, 2024
6ecb5cb
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 22, 2024
5ebd6f5
For len 0 return empty of same dtype
mroeschke Jan 22, 2024
a2045eb
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Jan 31, 2024
8792a96
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Feb 2, 2024
35e8aa5
Fix usages
mroeschke Feb 2, 2024
2356c80
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Feb 21, 2024
8daab08
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Feb 24, 2024
030eabb
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Feb 26, 2024
789d273
Merge branch 'branch-24.04' into ref/as_column_full
vyasr Feb 27, 2024
0067286
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Feb 29, 2024
40affa9
Merge branch 'ref/as_column_full' of https://github.com/mroeschke/cud…
mroeschke Feb 29, 2024
5ae05a0
Merge branch 'branch-24.04' into ref/as_column_full
mroeschke Mar 4, 2024
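Taken together, the commits remove the dedicated cudf.core.column.full helper and make call sites broadcast scalars through as_column with an explicit length= argument. Below is a minimal sketch of the call-site change (an illustration, not code from the PR), assuming a cuDF build that already includes this change:

```python
# Illustration of the migration pattern applied throughout this PR.
# Assumes a cuDF version that already contains this change.
import cudf
from cudf.core.column import as_column

# Before this PR, a constant column was built with the dedicated helper:
#     col = cudf.core.column.full(size=5, fill_value=7, dtype="int8")

# After this PR, as_column broadcasts a scalar when length is given:
col = as_column(7, length=5, dtype="int8")

print(cudf.Series(col))
# 0    7
# 1    7
# 2    7
# 3    7
# 4    7
# dtype: int8
```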
1 change: 0 additions & 1 deletion python/cudf/cudf/core/column/__init__.py
@@ -16,7 +16,6 @@
column_empty_like_same_mask,
concat_columns,
deserialize_columns,
full,
serialize_columns,
)
from cudf.core.column.datetime import DatetimeColumn # noqa: F401
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
@@ -734,8 +734,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
)
return other

ary = column.full(
len(self), self._encode(other), dtype=self.codes.dtype
ary = column.as_column(
self._encode(other), length=len(self), dtype=self.codes.dtype
)
return column.build_categorical_column(
categories=self.dtype.categories._values,
@@ -1438,11 +1438,9 @@ def _create_empty_categorical_column(
return column.build_categorical_column(
categories=column.as_column(dtype.categories),
codes=column.as_column(
column.full(
categorical_column.size,
_DEFAULT_CATEGORICAL_VALUE,
categorical_column.codes.dtype,
)
_DEFAULT_CATEGORICAL_VALUE,
length=categorical_column.size,
dtype=categorical_column.codes.dtype,
),
offset=categorical_column.offset,
size=categorical_column.size,
100 changes: 31 additions & 69 deletions python/cudf/cudf/core/column/column.py
@@ -58,7 +58,6 @@
infer_dtype,
is_bool_dtype,
is_datetime64_dtype,
is_decimal_dtype,
is_dtype_equal,
is_integer_dtype,
is_list_dtype,
@@ -855,7 +854,7 @@ def isin(self, values: Sequence) -> ColumnBase:
except ValueError:
# pandas functionally returns all False when cleansing via
# typecasting fails
return full(len(self), False, dtype="bool")
return as_column(False, length=len(self), dtype="bool")

return lhs._obtain_isin_result(rhs)

@@ -882,9 +881,9 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
if self.null_count and rhs.null_count:
return self.isnull()
else:
return cudf.core.column.full(len(self), False, dtype="bool")
return as_column(False, length=len(self), dtype="bool")
elif self.null_count == 0 and (rhs.null_count == len(rhs)):
return cudf.core.column.full(len(self), False, dtype="bool")
return as_column(False, length=len(self), dtype="bool")
else:
return None

@@ -1345,9 +1344,7 @@ def _label_encoding(
na_sentinel = cudf.Scalar(-1)

def _return_sentinel_column():
return cudf.core.column.full(
size=len(self), fill_value=na_sentinel, dtype=dtype
)
return as_column(na_sentinel, dtype=dtype, length=len(self))

if dtype is None:
dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
@@ -1444,7 +1441,9 @@ def column_empty(
elif isinstance(dtype, ListDtype):
data = None
children = (
full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
as_column(
0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
),
column_empty(row_count, dtype=dtype.element_type),
)
elif isinstance(dtype, CategoricalDtype):
@@ -1463,7 +1462,9 @@
elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype):
data = as_buffer(rmm.DeviceBuffer(size=0))
children = (
full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
as_column(
0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
),
)
else:
data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize))
@@ -2006,33 +2007,32 @@ def as_column(
if dtype is not None:
data = data.astype(dtype)

elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)):
# This will always treat NaTs as nulls since it's not technically a
# discrete value like NaN
length = length or 1
data = as_column(
pa.array(pd.Series([arbitrary] * length), from_pandas=True)
)
if dtype is not None:
data = data.astype(dtype)

elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
length = length or 1
elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview):
if length is None:
length = 1
elif length < 0:
raise ValueError(f"{length=} must be >=0.")
if isinstance(arbitrary, pd.Interval):
# No cudf.Scalar support yet
return as_column(
pd.Series([arbitrary] * length),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
if (
(nan_as_null is True)
nan_as_null is True
and isinstance(arbitrary, (np.floating, float))
and np.isnan(arbitrary)
):
arbitrary = None
if dtype is None:
dtype = cudf.dtype("float64")

data = as_column(full(length, arbitrary, dtype=dtype))
if not nan_as_null and not is_decimal_dtype(data.dtype):
if np.issubdtype(data.dtype, np.floating):
data = data.fillna(np.nan)
elif np.issubdtype(data.dtype, np.datetime64):
data = data.fillna(np.datetime64("NaT"))
dtype = getattr(arbitrary, "dtype", cudf.dtype("float64"))
arbitrary = None
arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
if length == 0:
return column_empty(length, dtype=arbitrary.dtype)
else:
return ColumnBase.from_scalar(arbitrary, length)

elif hasattr(arbitrary, "__array_interface__"):
# CUDF assumes values are always contiguous
@@ -2150,8 +2150,6 @@ def as_column(
return as_column(
np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
else:
if dtype is not None:
# Arrow throws a type error if the input is of
@@ -2494,42 +2492,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]:
return columns


def full(
size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None
) -> ColumnBase:
"""
Returns a column of given size and dtype, filled with a given value.

Parameters
----------
size : int
size of the expected column.
fill_value : scalar
A scalar value to fill a new array.
dtype : default None
Data type specifier. It is inferred from other arguments by default.

Returns
-------
Column

Examples
--------
>>> import cudf
>>> col = cudf.core.column.full(size=5, fill_value=7, dtype='int8')
>>> col
<cudf.core.column.numerical.NumericalColumn object at 0x7fa0912e8b90>
>>> cudf.Series(col)
0 7
1 7
2 7
3 7
4 7
dtype: int8
"""
return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size)


def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
"""Concatenate a sequence of columns."""
if len(objs) == 0:
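The rewritten scalar branch of as_column (shown in the hunk above) now defaults length to 1, rejects negative lengths, routes pd.Interval through pandas because there is no cudf.Scalar support for it, converts a float NaN to a null when nan_as_null=True, and returns an empty column of the right dtype when length == 0; otherwise it builds the result with cudf.Scalar plus ColumnBase.from_scalar. A rough behavior sketch, not part of the diff and dependent on the exact cuDF version:

```python
# Rough behavior sketch of the reworked scalar path in as_column,
# based on the column.py hunk above; outputs depend on the cuDF version.
import numpy as np
from cudf.core.column import as_column

as_column(7)                                   # length defaults to 1
as_column(7, length=3, dtype="int8")           # three rows of 7
as_column(7, length=0, dtype="int8")           # empty column, dtype kept
as_column(np.nan, length=2, nan_as_null=True)  # two nulls, float64 dtype

try:
    as_column(7, length=-1)                    # negative lengths are rejected
except ValueError as err:
    print(err)                                 # "length=-1 must be >=0."
```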
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/decimal.py
@@ -69,8 +69,8 @@ def as_string_column(
def __pow__(self, other):
if isinstance(other, int):
if other == 0:
res = cudf.core.column.full(
size=len(self), fill_value=1, dtype=self.dtype
res = cudf.core.column.as_column(
1, dtype=self.dtype, length=len(self)
)
if self.nullable:
res = res.set_mask(self.mask)
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/numerical.py
@@ -42,7 +42,6 @@
as_column,
build_column,
column,
full,
string,
)
from cudf.core.dtypes import CategoricalDtype
@@ -513,7 +512,7 @@ def find_and_replace(
)
if len(replacement_col) == 1 and len(to_replace_col) > 1:
replacement_col = column.as_column(
full(len(to_replace_col), replacement[0], self.dtype)
replacement[0], length=len(to_replace_col), dtype=self.dtype
)
elif len(replacement_col) == 1 and len(to_replace_col) == 0:
return self.copy()
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/column/string.py
@@ -5499,7 +5499,9 @@ def __init__(

if len(children) == 0 and size != 0:
# all nulls-column:
offsets = column.full(size + 1, 0, dtype=size_type_dtype)
offsets = column.as_column(
0, length=size + 1, dtype=size_type_dtype
)

children = (offsets,)

@@ -5921,8 +5923,8 @@ def _binaryop(
"__eq__",
"__ne__",
}:
return column.full(
len(self), op == "__ne__", dtype="bool"
return column.as_column(
op == "__ne__", length=len(self), dtype="bool"
).set_mask(self.mask)
else:
return NotImplemented
@@ -5931,7 +5933,9 @@
if isinstance(other, cudf.Scalar):
other = cast(
StringColumn,
column.full(len(self), other, dtype="object"),
column.as_column(
other, length=len(self), dtype="object"
),
)

# Explicit types are necessary because mypy infers ColumnBase
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/timedelta.py
@@ -499,7 +499,7 @@ def components(self, index=None) -> "cudf.DataFrame":
break

for name in keys_list:
res_col = cudf.core.column.full(len(self), 0, dtype="int64")
res_col = column.as_column(0, length=len(self), dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
data[name] = res_col
@@ -588,7 +588,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
# of nanoseconds.

if self._time_unit != "ns":
res_col = cudf.core.column.full(len(self), 0, dtype="int64")
res_col = column.as_column(0, length=len(self), dtype="int64")
if self.nullable:
res_col = res_col.set_mask(self.mask)
return cast("cudf.core.column.NumericalColumn", res_col)
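The decimal.py and timedelta.py hunks above share a pattern: build a constant column with as_column(..., length=len(self)), then copy the parent column's null mask back with set_mask so existing nulls stay null. A hedged sketch of that pattern, using a hypothetical helper name:

```python
# Sketch of the "constant column that keeps the caller's nulls" pattern
# used in the decimal.py and timedelta.py hunks; the helper name is made up.
from cudf.core.column import as_column

def zeros_like_with_nulls(col):
    res = as_column(0, length=len(col), dtype="int64")
    if col.nullable:
        # Re-apply the source column's validity mask so nulls stay null.
        res = res.set_mask(col.mask)
    return res
```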
26 changes: 16 additions & 10 deletions python/cudf/cudf/core/dataframe.py
@@ -1407,7 +1407,7 @@ def __setitem__(self, arg, value):
allow_non_unique=True,
)
if is_scalar(value):
self._data[arg] = column.full(len(self), value)
self._data[arg] = as_column(value, length=len(self))
else:
value = as_column(value)
self._data[arg] = value
@@ -1455,8 +1455,8 @@ def __setitem__(self, arg, value):
else:
for col in arg:
if is_scalar(value):
self._data[col] = column.full(
size=len(self), fill_value=value
self._data[col] = as_column(
value, length=len(self)
)
else:
self._data[col] = column.as_column(value)
@@ -3205,10 +3205,16 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
)

if _is_scalar_or_zero_d_array(value):
value = column.full(
len(self),
dtype = None
if isinstance(value, (np.ndarray, cupy.ndarray)):
dtype = value.dtype
value = value.item()
if libcudf.scalar._is_null_host_scalar(value):
dtype = "str"
value = as_column(
value,
"str" if libcudf.scalar._is_null_host_scalar(value) else None,
length=len(self),
dtype=dtype,
)

if len(self) == 0:
@@ -5898,7 +5904,7 @@ def isin(self, values):
fill_value = cudf.Scalar(False)

def make_false_column_like_self():
return column.full(len(self), fill_value, "bool")
return column.as_column(fill_value, length=len(self), dtype="bool")

# Preprocess different input types into a mapping from column names to
# a list of values to check.
@@ -6017,7 +6023,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
{
name: filtered._data[name]._get_mask_as_column()
if filtered._data[name].nullable
else column.full(len(filtered._data[name]), True)
else as_column(True, length=len(filtered._data[name]))
for name in filtered._data.names
}
)
@@ -7808,8 +7814,8 @@ def func(left, right, output):
return output

for name in uncommon_columns:
output._data[name] = column.full(
size=len(output), fill_value=value, dtype="bool"
output._data[name] = as_column(
value, length=len(output), dtype="bool"
)
return output

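In the DataFrame._insert hunk above, the old code delegated null handling to column.full; the new code first normalizes the value itself: a NumPy or CuPy 0-d array is unpacked with .item() while its dtype is kept, and a host null scalar forces a "str" dtype so the broadcast column becomes all-null string data. A simplified stand-in for that normalization, for illustration only:

```python
# Simplified stand-in for the scalar/0-d-array normalization in the
# DataFrame._insert hunk above; the real code also handles cupy.ndarray
# and uses libcudf.scalar._is_null_host_scalar for the null check.
import numpy as np

def normalize_insert_value(value):
    dtype = None
    if isinstance(value, np.ndarray) and value.ndim == 0:
        dtype = value.dtype   # keep the 0-d array's dtype explicitly
        value = value.item()
    if value is None or (isinstance(value, float) and np.isnan(value)):
        dtype = "str"         # a null scalar broadcasts to an all-null column
    return value, dtype

# The normalized pair then feeds as_column(value, length=len(df), dtype=dtype).
```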
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/index.py
@@ -1227,9 +1227,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
)

needle = as_column(target)
result = cudf.core.column.full(
len(needle),
fill_value=-1,
result = as_column(
-1,
length=len(needle),
dtype=libcudf.types.size_type_dtype,
)
