Skip to content

Commit

Permalink
Implement to_datetime(..., utc=True) (#14749)
Browse files Browse the repository at this point in the history
closes #13661

Also adds typing and fixes a bug where `uint` data would raise a TypeError

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #14749
  • Loading branch information
mroeschke authored Jan 22, 2024
1 parent f258d04 commit d1c0e25
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 50 deletions.
100 changes: 50 additions & 50 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

import math
import re
import warnings
from typing import Sequence, Union
from typing import Literal, Optional, Sequence, Union

import cupy as cp
import numpy as np
Expand Down Expand Up @@ -49,16 +49,16 @@

def to_datetime(
arg,
errors="raise",
dayfirst=False,
yearfirst=False,
utc=None,
format=None,
exact=True,
unit="ns",
infer_datetime_format=False,
errors: Literal["raise", "coerce", "warn", "ignore"] = "raise",
dayfirst: bool = False,
yearfirst: bool = False,
utc: bool = False,
format: Optional[str] = None,
exact: bool = True,
unit: str = "ns",
infer_datetime_format: bool = False,
origin="unix",
cache=True,
cache: bool = True,
):
"""
Convert argument to datetime.
Expand All @@ -80,6 +80,8 @@ def to_datetime(
2012-11-10.
Warning: dayfirst=True is not strict, but will prefer to parse
with day first (this is a known bug, based on dateutil behavior).
utc : bool, default False
Whether the result should have a UTC timezone.
format : str, default None
The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
all the way up to nanoseconds.
Expand Down Expand Up @@ -148,9 +150,6 @@ def to_datetime(
if yearfirst:
raise NotImplementedError("yearfirst support is not yet implemented")

if utc:
raise NotImplementedError("utc is not yet implemented")

if format is not None:
if "%Z" in format or "%z" in format:
raise NotImplementedError(
Expand All @@ -165,24 +164,24 @@ def to_datetime(
required = ["year", "month", "day"]
req = list(set(required) - set(arg._data.names))
if len(req):
req = ",".join(req)
err_req = ",".join(req)
raise ValueError(
f"to assemble mappings requires at least that "
f"[year, month, day] be specified: [{req}] "
f"[year, month, day] be specified: [{err_req}] "
f"is missing"
)

# replace passed column name with values in _unit_map
unit = {k: get_units(k) for k in arg._data.names}
unit_rev = {v: k for k, v in unit.items()}
got_units = {k: get_units(k) for k in arg._data.names}
unit_rev = {v: k for k, v in got_units.items()}

# keys we don't recognize
excess = set(unit_rev.keys()) - set(_unit_map.values())
if len(excess):
excess = ",".join(excess)
err_excess = ",".join(excess)
raise ValueError(
f"extra keys have been passed to the "
f"datetime assemblage: [{excess}]"
f"datetime assemblage: [{err_excess}]"
)

new_series = (
Expand Down Expand Up @@ -245,38 +244,29 @@ def to_datetime(
col = (col.astype(dtype="int64") + times_column).astype(
dtype=col.dtype
)
return cudf.Series(col, index=arg.index)
elif isinstance(arg, cudf.BaseIndex):
col = arg._values
col = _process_col(
col=col,
unit=unit,
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
)
return as_index(col, name=arg.name)
elif isinstance(arg, (cudf.Series, pd.Series)):
col = column.as_column(arg)
col = _process_col(
col=col,
unit=unit,
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
utc=utc,
)
return cudf.Series(col, index=arg.index, name=arg.name)
return cudf.Series(col, index=arg.index)
else:
col = column.as_column(arg)
col = _process_col(
col=col,
col=column.as_column(arg),
unit=unit,
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
utc=utc,
)

if is_scalar(arg):
if isinstance(arg, (cudf.BaseIndex, pd.Index)):
return as_index(col, name=arg.name)
elif isinstance(arg, (cudf.Series, pd.Series)):
return cudf.Series(col, index=arg.index, name=arg.name)
elif is_scalar(arg):
return col.element_indexing(0)
else:
return as_index(col)
Expand All @@ -295,11 +285,18 @@ def to_datetime(
return arg


def _process_col(col, unit, dayfirst, infer_datetime_format, format):
if col.dtype.kind == "M":
return col
def _process_col(
col,
unit: str,
dayfirst: bool,
infer_datetime_format: bool,
format: Optional[str],
utc: bool,
):
# Causes circular import
from cudf.core._internals.timezones import localize

elif col.dtype.kind in ("f"):
if col.dtype.kind == "f":
if unit not in (None, "ns"):
factor = cudf.Scalar(
column.datetime._unit_to_nanoseconds_conversion[unit]
Expand All @@ -325,9 +322,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
)
else:
col = col.as_datetime_column(dtype="datetime64[ns]")
return col

elif col.dtype.kind in ("i"):
elif col.dtype.kind in "iu":
if unit in ("D", "h", "m"):
factor = cudf.Scalar(
column.datetime._unit_to_nanoseconds_conversion[unit]
Expand All @@ -341,9 +337,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
)
else:
col = col.as_datetime_column(dtype=_unit_dtype_map[unit])
return col

elif col.dtype.kind in ("O"):
elif col.dtype.kind == "O":
if unit not in (None, "ns") or col.null_count == len(col):
try:
col = col.astype(dtype="int64")
Expand All @@ -355,6 +350,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
dayfirst=dayfirst,
infer_datetime_format=infer_datetime_format,
format=format,
utc=utc,
)
else:
if format is None:
Expand All @@ -367,13 +363,17 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
element=col.element_indexing(0),
dayfirst=dayfirst,
)
return col.as_datetime_column(
col = col.as_datetime_column(
dtype=_unit_dtype_map[unit],
format=format,
)
raise TypeError(
f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
)
elif col.dtype.kind != "M":
raise TypeError(
f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
)
if utc and not isinstance(col.dtype, pd.DatetimeTZDtype):
return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT")
return col


def get_units(value):
Expand Down
35 changes: 35 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2431,6 +2431,41 @@ def test_to_datetime_errors_non_scalar_not_implemented(errors):
cudf.to_datetime([1, ""], unit="s", errors=errors)


@pytest.mark.parametrize(
    "box", [list, pd.Index, cudf.Index, pd.Series, cudf.Series]
)
@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
def test_to_datetime_arraylike_utc_true(box, dtype):
    """Compare ``cudf.to_datetime(..., utc=True)`` with pandas on array-likes.

    The values ``[1, 2]`` are wrapped in ``box``; every container except a
    plain ``list`` is additionally cast to ``dtype`` before conversion.
    """
    raw_values = [1, 2]
    boxed = box(raw_values)
    # A plain list has no ``astype``, so only non-list containers are cast.
    gpu_input = boxed if box is list else boxed.astype(dtype)

    # For Series boxes the pandas reference input is a Series as well;
    # otherwise the raw list is handed to pandas.
    is_series_box = box is cudf.Series or box is pd.Series
    host_input = pd.Series(raw_values) if is_series_box else raw_values

    result = cudf.to_datetime(gpu_input, utc=True)
    expected = pd.to_datetime(host_input, utc=True)
    assert_eq(result, expected)


# Converts a scalar pd.Timestamp with utc=True and compares the result against
# a pandas Timestamp constructed with tz="UTC". The test is marked xfail and
# is expected to raise TypeError (see the `reason` below).
@pytest.mark.xfail(
raises=TypeError,
reason="libcudf.copying.get_element doesn't understand pd.DatetimeTZDtype",
)
def test_to_datetime_scalar_utc_true():
data = pd.Timestamp(2020, 1, 1)
# The conversion runs with the "mode.pandas_compatible" option set to True.
with cudf.option_context("mode.pandas_compatible", True):
result = cudf.to_datetime(data, utc=True)
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# comparison below sits inside the option context or after it — confirm.
expected = pd.Timestamp(year=2020, month=1, day=1, tz="UTC")
assert_eq(result, expected)


def test_to_datetime_dataframe_utc_true():
    """Check DataFrame input to ``cudf.to_datetime`` with ``utc=True``.

    A single-row frame with ``year``/``month``/``day`` columns is converted
    and compared against a pandas Series localized to UTC.
    """
    frame = cudf.DataFrame(
        [[2020, 1, 1]],
        columns=["year", "month", "day"],
    )
    result = cudf.to_datetime(frame, utc=True)

    # Reference: the same date as a naive pandas Series, then localized to UTC.
    naive = pd.Series([datetime.datetime(2020, 1, 1)])
    expected = naive.dt.tz_localize("UTC")
    assert_eq(result, expected)


def test_datetimeindex_dtype_np_dtype():
dtype = np.dtype("datetime64[ns]")
data = [1]
Expand Down

0 comments on commit d1c0e25

Please sign in to comment.