Skip to content

Commit

Permalink
Add Column.strftime/strptime instead of overloading `as_string/datetime/timedelta_column` (#16243)
Browse files Browse the repository at this point in the history

`Column.as_string_column`, `Column.as_datetime_column`, and `Column.as_timedelta_column` each accepted a `format` argument, but that argument was silently ignored for columns that were not of these types or that did not require a conversion to these types.

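This PR introduces `strftime` and `strptime` methods on the column that handle the `format` argument explicitly, and removes `format` from the `as_string_column`, `as_datetime_column`, and `as_timedelta_column` signatures.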

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16243
  • Loading branch information
mroeschke authored Jul 11, 2024
1 parent b06d883 commit 53de73d
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 201 deletions.
24 changes: 6 additions & 18 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1136,26 +1136,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn:
def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
return self._get_decategorized_column().as_numerical_column(dtype)

def as_string_column(
self, dtype, format: str | None = None
) -> StringColumn:
return self._get_decategorized_column().as_string_column(
dtype, format=format
)
def as_string_column(self) -> StringColumn:
return self._get_decategorized_column().as_string_column()

def as_datetime_column(
self, dtype, format: str | None = None
) -> DatetimeColumn:
return self._get_decategorized_column().as_datetime_column(
dtype, format
)
def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
return self._get_decategorized_column().as_datetime_column(dtype)

def as_timedelta_column(
self, dtype, format: str | None = None
) -> TimeDeltaColumn:
return self._get_decategorized_column().as_timedelta_column(
dtype, format
)
def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
return self._get_decategorized_column().as_timedelta_column(dtype)

def _get_decategorized_column(self) -> ColumnBase:
if self.null_count == len(self):
Expand Down
14 changes: 6 additions & 8 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
f"Casting to {dtype} is not supported, use "
"`.astype('str')` instead."
)
result = self.as_string_column(dtype)
result = self.as_string_column()
else:
result = self.as_numerical_column(dtype)

Expand Down Expand Up @@ -1059,8 +1059,8 @@ def as_numerical_column(
raise NotImplementedError

def as_datetime_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.DatetimeColumn":
self, dtype: Dtype
) -> cudf.core.column.DatetimeColumn:
raise NotImplementedError

def as_interval_column(
Expand All @@ -1069,13 +1069,11 @@ def as_interval_column(
raise NotImplementedError

def as_timedelta_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.TimeDeltaColumn":
self, dtype: Dtype
) -> cudf.core.column.TimeDeltaColumn:
raise NotImplementedError

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
raise NotImplementedError

def as_decimal_column(
Expand Down
126 changes: 57 additions & 69 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,43 +178,6 @@ def _resolve_mixed_dtypes(
return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]")


def _get_datetime_format(col, dtype, time_unit):
format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S")
if format.endswith("f"):
sub_second_res_len = 3
else:
sub_second_res_len = 0

has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any()
has_micros = (
time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any()
)
has_millis = (
time_unit in {"ns", "us", "ms"}
and col.get_dt_field("millisecond").any()
)
has_seconds = col.get_dt_field("second").any()
has_minutes = col.get_dt_field("minute").any()
has_hours = col.get_dt_field("hour").any()
if sub_second_res_len:
if has_nanos:
# format should be intact and rest of the
# following conditions shouldn't execute.
pass
elif has_micros:
format = format[:-sub_second_res_len] + "%6f"
elif has_millis:
format = format[:-sub_second_res_len] + "%3f"
elif has_seconds or has_minutes or has_hours:
format = format[:-4]
else:
format = format.split(" ")[0]
else:
if not (has_seconds or has_minutes or has_hours):
format = format.split(" ")[0]
return format


class DatetimeColumn(column.ColumnBase):
"""
A Column implementation for Date-time types.
Expand Down Expand Up @@ -381,9 +344,7 @@ def round(self, freq: str) -> ColumnBase:

def isocalendar(self) -> dict[str, ColumnBase]:
return {
field: self.as_string_column("str", format=directive).astype(
"uint32"
)
field: self.strftime(format=directive).astype("uint32")
for field, directive in zip(
["year", "week", "day"], ["%G", "%V", "%u"]
)
Expand Down Expand Up @@ -445,17 +406,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:

return NotImplemented

def as_datetime_column(
self, dtype: Dtype, format: str | None = None
) -> DatetimeColumn:
dtype = cudf.dtype(dtype)
def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
if dtype == self.dtype:
return self
return libcudf.unary.cast(self, dtype=dtype)

def as_timedelta_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.TimeDeltaColumn":
def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override]
raise TypeError(
f"cannot astype a datetimelike from {self.dtype} to {dtype}"
)
Expand All @@ -472,32 +428,63 @@ def as_numerical_column(
)
return cast("cudf.core.column.NumericalColumn", col.astype(dtype))

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
if format is None:
format = _dtype_to_format_conversion.get(
self.dtype.name, "%Y-%m-%d %H:%M:%S"
def strftime(self, format: str) -> cudf.core.column.StringColumn:
if len(self) == 0:
return cast(
cudf.core.column.StringColumn,
column.column_empty(0, dtype="object", masked=False),
)
if cudf.get_option("mode.pandas_compatible"):
format = _get_datetime_format(
self, dtype=self.dtype, time_unit=self.time_unit
)
if format in _DATETIME_SPECIAL_FORMATS:
names = as_column(_DATETIME_NAMES)
else:
names = cudf.core.column.column_empty(
0, dtype="object", masked=False
)
if len(self) > 0:
return string._datetime_to_str_typecast_functions[
cudf.dtype(self.dtype)
](self, format, names)
else:
return cast(
"cudf.core.column.StringColumn",
column.column_empty(0, dtype="object", masked=False),
return string._datetime_to_str_typecast_functions[self.dtype](
self, format, names
)

def as_string_column(self) -> cudf.core.column.StringColumn:
format = _dtype_to_format_conversion.get(
self.dtype.name, "%Y-%m-%d %H:%M:%S"
)
if cudf.get_option("mode.pandas_compatible"):
if format.endswith("f"):
sub_second_res_len = 3
else:
sub_second_res_len = 0

has_nanos = (
self.time_unit in {"ns"}
and self.get_dt_field("nanosecond").any()
)
has_micros = (
self.time_unit in {"ns", "us"}
and self.get_dt_field("microsecond").any()
)
has_millis = (
self.time_unit in {"ns", "us", "ms"}
and self.get_dt_field("millisecond").any()
)
has_seconds = self.get_dt_field("second").any()
has_minutes = self.get_dt_field("minute").any()
has_hours = self.get_dt_field("hour").any()
if sub_second_res_len:
if has_nanos:
# format should be intact and rest of the
# following conditions shouldn't execute.
pass
elif has_micros:
format = format[:-sub_second_res_len] + "%6f"
elif has_millis:
format = format[:-sub_second_res_len] + "%3f"
elif has_seconds or has_minutes or has_hours:
format = format[:-4]
else:
format = format.split(" ")[0]
elif not (has_seconds or has_minutes or has_hours):
format = format.split(" ")[0]
return self.strftime(format)

def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
Expand Down Expand Up @@ -872,10 +859,11 @@ def _local_time(self):
offsets_from_utc = offsets.take(indices, nullify=True)
return self + offsets_from_utc

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
return self._local_time.as_string_column(dtype, format)
def strftime(self, format: str) -> cudf.core.column.StringColumn:
return self._local_time.strftime(format)

def as_string_column(self) -> cudf.core.column.StringColumn:
return self._local_time.as_string_column()

def get_dt_field(self, field: str) -> ColumnBase:
return libcudf.datetime.extract_datetime_component(
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,7 @@ def as_decimal_column(
return self
return libcudf.unary.cast(self, dtype)

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
if len(self) > 0:
return cpp_from_decimal(self)
else:
Expand Down
8 changes: 2 additions & 6 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,11 @@ def from_sequences(
)
return res

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
"""
Create a strings column from a list column
"""
lc = self._transform_leaves(
lambda col, dtype: col.as_string_column(dtype), dtype
)
lc = self._transform_leaves(lambda col: col.as_string_column())

# Separator strings to match the Python format
separators = as_column([", ", "[", "]"])
Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn":

return libcudf.string_casting.int2ip(self)

def as_string_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.StringColumn":
def as_string_column(self) -> cudf.core.column.StringColumn:
if len(self) > 0:
return string._numeric_to_str_typecast_functions[
cudf.dtype(self.dtype)
Expand All @@ -345,8 +343,8 @@ def as_string_column(
)

def as_datetime_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.DatetimeColumn":
self, dtype: Dtype
) -> cudf.core.column.DatetimeColumn:
return cast(
"cudf.core.column.DatetimeColumn",
build_column(
Expand All @@ -359,8 +357,8 @@ def as_datetime_column(
)

def as_timedelta_column(
self, dtype: Dtype, format: str | None = None
) -> "cudf.core.column.TimeDeltaColumn":
self, dtype: Dtype
) -> cudf.core.column.TimeDeltaColumn:
return cast(
"cudf.core.column.TimeDeltaColumn",
build_column(
Expand Down
Loading

0 comments on commit 53de73d

Please sign in to comment.