Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Column.strftime/strptime instead of overloading as_string/datetime/timedelta_column #16243

Merged
3 commits merged on Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 6 additions & 18 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1136,26 +1136,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn:
def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
return self._get_decategorized_column().as_numerical_column(dtype)

def as_string_column(
self, dtype, format: str | None = None
) -> StringColumn:
return self._get_decategorized_column().as_string_column(
dtype, format=format
)
def as_string_column(self) -> StringColumn:
return self._get_decategorized_column().as_string_column()

def as_datetime_column(
self, dtype, format: str | None = None
) -> DatetimeColumn:
return self._get_decategorized_column().as_datetime_column(
dtype, format
)
def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
return self._get_decategorized_column().as_datetime_column(dtype)

def as_timedelta_column(
self, dtype, format: str | None = None
) -> TimeDeltaColumn:
return self._get_decategorized_column().as_timedelta_column(
dtype, format
)
def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
return self._get_decategorized_column().as_timedelta_column(dtype)

def _get_decategorized_column(self) -> ColumnBase:
if self.null_count == len(self):
Expand Down
14 changes: 6 additions & 8 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
f"Casting to {dtype} is not supported, use "
"`.astype('str')` instead."
)
result = self.as_string_column(dtype)
result = self.as_string_column()
else:
result = self.as_numerical_column(dtype)

Expand Down
Expand Up @@ -1059,8 +1059,8 @@ def as_numerical_column(
raise NotImplementedError

def as_datetime_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.DatetimeColumn":
self, dtype: Dtype
) -> cudf.core.column.DatetimeColumn:
raise NotImplementedError

def as_interval_column(
Expand All @@ -1069,13 +1069,11 @@ def as_interval_column(
raise NotImplementedError

def as_timedelta_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.TimeDeltaColumn":
self, dtype: Dtype
) -> cudf.core.column.TimeDeltaColumn:
raise NotImplementedError

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
raise NotImplementedError

def as_decimal_column(
Expand Down
126 changes: 57 additions & 69 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,43 +178,6 @@ def _resolve_mixed_dtypes(
return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]")


def _get_datetime_format(col, dtype, time_unit):
format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S")
if format.endswith("f"):
sub_second_res_len = 3
else:
sub_second_res_len = 0

has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any()
has_micros = (
time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any()
)
has_millis = (
time_unit in {"ns", "us", "ms"}
and col.get_dt_field("millisecond").any()
)
has_seconds = col.get_dt_field("second").any()
has_minutes = col.get_dt_field("minute").any()
has_hours = col.get_dt_field("hour").any()
if sub_second_res_len:
if has_nanos:
# format should be intact and rest of the
# following conditions shouldn't execute.
pass
elif has_micros:
format = format[:-sub_second_res_len] + "%6f"
elif has_millis:
format = format[:-sub_second_res_len] + "%3f"
elif has_seconds or has_minutes or has_hours:
format = format[:-4]
else:
format = format.split(" ")[0]
else:
if not (has_seconds or has_minutes or has_hours):
format = format.split(" ")[0]
return format


class DatetimeColumn(column.ColumnBase):
"""
A Column implementation for Date-time types.
Expand Down
Expand Up @@ -381,9 +344,7 @@ def round(self, freq: str) -> ColumnBase:

def isocalendar(self) -> dict[str, ColumnBase]:
return {
field: self.as_string_column("str", format=directive).astype(
"uint32"
)
field: self.strftime(format=directive).astype("uint32")
for field, directive in zip(
["year", "week", "day"], ["%G", "%V", "%u"]
)
Expand Down
Expand Up @@ -445,17 +406,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:

return NotImplemented

def as_datetime_column(
self, dtype: Dtype, format: str | None = None
) -> DatetimeColumn:
dtype = cudf.dtype(dtype)
def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype=dtype)

def as_timedelta_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.TimeDeltaColumn":
def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override]
raise TypeError(
f"cannot astype a datetimelike from {self.dtype} to {dtype}"
)
Expand All @@ -472,32 +428,63 @@ def as_numerical_column(
)
return cast("cudf.core.column.NumericalColumn", col.astype(dtype))

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
if format is None:
format = _dtype_to_format_conversion.get(
self.dtype.name, "%Y-%m-%d %H:%M:%S"
def strftime(self, format: str) -> cudf.core.column.StringColumn:
if len(self) == 0:
return cast(
cudf.core.column.StringColumn,
column.column_empty(0, dtype="object", masked=False),
)
if cudf.get_option("mode.pandas_compatible"):
format = _get_datetime_format(
self, dtype=self.dtype, time_unit=self.time_unit
)
if format in _DATETIME_SPECIAL_FORMATS:
names = as_column(_DATETIME_NAMES)
else:
names = cudf.core.column.column_empty(
0, dtype="object", masked=False
)
if len(self) > 0:
return string._datetime_to_str_typecast_functions[
cudf.dtype(self.dtype)
](self, format, names)
else:
return cast(
"cudf.core.column.StringColumn",
column.column_empty(0, dtype="object", masked=False),
return string._datetime_to_str_typecast_functions[self.dtype](
self, format, names
)

def as_string_column(self) -> cudf.core.column.StringColumn:
format = _dtype_to_format_conversion.get(
self.dtype.name, "%Y-%m-%d %H:%M:%S"
)
if cudf.get_option("mode.pandas_compatible"):
if format.endswith("f"):
sub_second_res_len = 3
else:
sub_second_res_len = 0

has_nanos = (
self.time_unit in {"ns"}
and self.get_dt_field("nanosecond").any()
)
has_micros = (
self.time_unit in {"ns", "us"}
and self.get_dt_field("microsecond").any()
)
has_millis = (
self.time_unit in {"ns", "us", "ms"}
and self.get_dt_field("millisecond").any()
)
has_seconds = self.get_dt_field("second").any()
has_minutes = self.get_dt_field("minute").any()
has_hours = self.get_dt_field("hour").any()
if sub_second_res_len:
if has_nanos:
# format should be intact and rest of the
# following conditions shouldn't execute.
pass
elif has_micros:
format = format[:-sub_second_res_len] + "%6f"
elif has_millis:
format = format[:-sub_second_res_len] + "%3f"
elif has_seconds or has_minutes or has_hours:
format = format[:-4]
else:
format = format.split(" ")[0]
elif not (has_seconds or has_minutes or has_hours):
format = format.split(" ")[0]
return self.strftime(format)

def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
Expand Down
Expand Up @@ -872,10 +859,11 @@ def _local_time(self):
offsets_from_utc = offsets.take(indices, nullify=True)
return self + offsets_from_utc

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
return self._local_time.as_string_column(dtype, format)
def strftime(self, format: str) -> cudf.core.column.StringColumn:
return self._local_time.strftime(format)

def as_string_column(self) -> cudf.core.column.StringColumn:
return self._local_time.as_string_column()

def get_dt_field(self, field: str) -> ColumnBase:
return libcudf.datetime.extract_datetime_component(
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,7 @@ def as_decimal_column(
return self
return libcudf.unary.cast(self, dtype)

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
if len(self) > 0:
return cpp_from_decimal(self)
else:
Expand Down
8 changes: 2 additions & 6 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,11 @@ def from_sequences(
)
return res

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
"""
Create a strings column from a list column
"""
lc = self._transform_leaves(
lambda col, dtype: col.as_string_column(dtype), dtype
)
lc = self._transform_leaves(lambda col: col.as_string_column())

# Separator strings to match the Python format
separators = as_column([", ", "[", "]"])
Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn":

return libcudf.string_casting.int2ip(self)

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
if len(self) > 0:
return string._numeric_to_str_typecast_functions[
cudf.dtype(self.dtype)
Expand All @@ -345,8 +343,8 @@ def as_string_column(
)

def as_datetime_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.DatetimeColumn":
self, dtype: Dtype
) -> cudf.core.column.DatetimeColumn:
return cast(
"cudf.core.column.DatetimeColumn",
build_column(
Expand All @@ -359,8 +357,8 @@ def as_datetime_column(
)

def as_timedelta_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.TimeDeltaColumn":
self, dtype: Dtype
) -> cudf.core.column.TimeDeltaColumn:
return cast(
"cudf.core.column.TimeDeltaColumn",
build_column(
Expand Down
Loading
Loading