Implement to_datetime(..., utc=True) (#14749)

Closes #13661.

Also adds typing and fixes a bug where `uint` data would raise a `TypeError`.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Ashwin Srinath (https://github.com/shwina)

URL: #14749
rapidsai · Jan 22, 2024 · d1c0e25 · d1c0e25
1 parent f258d04
commit d1c0e25
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 50 deletions.
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import math
 import re
 import warnings
-from typing import Sequence, Union
+from typing import Literal, Optional, Sequence, Union
 
 import cupy as cp
 import numpy as np
@@ -49,16 +49,16 @@
 
 def to_datetime(
     arg,
-    errors="raise",
-    dayfirst=False,
-    yearfirst=False,
-    utc=None,
-    format=None,
-    exact=True,
-    unit="ns",
-    infer_datetime_format=False,
+    errors: Literal["raise", "coerce", "warn", "ignore"] = "raise",
+    dayfirst: bool = False,
+    yearfirst: bool = False,
+    utc: bool = False,
+    format: Optional[str] = None,
+    exact: bool = True,
+    unit: str = "ns",
+    infer_datetime_format: bool = False,
     origin="unix",
-    cache=True,
+    cache: bool = True,
 ):
     """
     Convert argument to datetime.
@@ -80,6 +80,8 @@ def to_datetime(
         2012-11-10.
         Warning: dayfirst=True is not strict, but will prefer to parse
         with day first (this is a known bug, based on dateutil behavior).
+    utc : bool, default False
+        Whether the result should have a UTC timezone.
     format : str, default None
         The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
         all the way up to nanoseconds.
@@ -148,9 +150,6 @@ def to_datetime(
     if yearfirst:
         raise NotImplementedError("yearfirst support is not yet implemented")
 
-    if utc:
-        raise NotImplementedError("utc is not yet implemented")
-
     if format is not None:
         if "%Z" in format or "%z" in format:
             raise NotImplementedError(
@@ -165,24 +164,24 @@ def to_datetime(
             required = ["year", "month", "day"]
             req = list(set(required) - set(arg._data.names))
             if len(req):
-                req = ",".join(req)
+                err_req = ",".join(req)
                 raise ValueError(
                     f"to assemble mappings requires at least that "
-                    f"[year, month, day] be specified: [{req}] "
+                    f"[year, month, day] be specified: [{err_req}] "
                     f"is missing"
                 )
 
             # replace passed column name with values in _unit_map
-            unit = {k: get_units(k) for k in arg._data.names}
-            unit_rev = {v: k for k, v in unit.items()}
+            got_units = {k: get_units(k) for k in arg._data.names}
+            unit_rev = {v: k for k, v in got_units.items()}
 
             # keys we don't recognize
             excess = set(unit_rev.keys()) - set(_unit_map.values())
             if len(excess):
-                excess = ",".join(excess)
+                err_excess = ",".join(excess)
                 raise ValueError(
                     f"extra keys have been passed to the "
-                    f"datetime assemblage: [{excess}]"
+                    f"datetime assemblage: [{err_excess}]"
                 )
 
             new_series = (
@@ -245,38 +244,29 @@ def to_datetime(
                 col = (col.astype(dtype="int64") + times_column).astype(
                     dtype=col.dtype
                 )
-            return cudf.Series(col, index=arg.index)
-        elif isinstance(arg, cudf.BaseIndex):
-            col = arg._values
-            col = _process_col(
-                col=col,
-                unit=unit,
-                dayfirst=dayfirst,
-                infer_datetime_format=infer_datetime_format,
-                format=format,
-            )
-            return as_index(col, name=arg.name)
-        elif isinstance(arg, (cudf.Series, pd.Series)):
-            col = column.as_column(arg)
             col = _process_col(
                 col=col,
                 unit=unit,
                 dayfirst=dayfirst,
                 infer_datetime_format=infer_datetime_format,
                 format=format,
+                utc=utc,
             )
-            return cudf.Series(col, index=arg.index, name=arg.name)
+            return cudf.Series(col, index=arg.index)
         else:
-            col = column.as_column(arg)
             col = _process_col(
-                col=col,
+                col=column.as_column(arg),
                 unit=unit,
                 dayfirst=dayfirst,
                 infer_datetime_format=infer_datetime_format,
                 format=format,
+                utc=utc,
             )
-
-            if is_scalar(arg):
+            if isinstance(arg, (cudf.BaseIndex, pd.Index)):
+                return as_index(col, name=arg.name)
+            elif isinstance(arg, (cudf.Series, pd.Series)):
+                return cudf.Series(col, index=arg.index, name=arg.name)
+            elif is_scalar(arg):
                 return col.element_indexing(0)
             else:
                 return as_index(col)
@@ -295,11 +285,18 @@ def to_datetime(
         return arg
 
 
-def _process_col(col, unit, dayfirst, infer_datetime_format, format):
-    if col.dtype.kind == "M":
-        return col
+def _process_col(
+    col,
+    unit: str,
+    dayfirst: bool,
+    infer_datetime_format: bool,
+    format: Optional[str],
+    utc: bool,
+):
+    # Causes circular import
+    from cudf.core._internals.timezones import localize
 
-    elif col.dtype.kind in ("f"):
+    if col.dtype.kind == "f":
         if unit not in (None, "ns"):
             factor = cudf.Scalar(
                 column.datetime._unit_to_nanoseconds_conversion[unit]
@@ -325,9 +322,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
             )
         else:
             col = col.as_datetime_column(dtype="datetime64[ns]")
-        return col
 
-    elif col.dtype.kind in ("i"):
+    elif col.dtype.kind in "iu":
         if unit in ("D", "h", "m"):
             factor = cudf.Scalar(
                 column.datetime._unit_to_nanoseconds_conversion[unit]
@@ -341,9 +337,8 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
             )
         else:
             col = col.as_datetime_column(dtype=_unit_dtype_map[unit])
-        return col
 
-    elif col.dtype.kind in ("O"):
+    elif col.dtype.kind == "O":
         if unit not in (None, "ns") or col.null_count == len(col):
             try:
                 col = col.astype(dtype="int64")
@@ -355,6 +350,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
                 dayfirst=dayfirst,
                 infer_datetime_format=infer_datetime_format,
                 format=format,
+                utc=utc,
             )
         else:
             if format is None:
@@ -367,13 +363,17 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format):
                     element=col.element_indexing(0),
                     dayfirst=dayfirst,
                 )
-            return col.as_datetime_column(
+            col = col.as_datetime_column(
                 dtype=_unit_dtype_map[unit],
                 format=format,
             )
-    raise TypeError(
-        f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
-    )
+    elif col.dtype.kind != "M":
+        raise TypeError(
+            f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
+        )
+    if utc and not isinstance(col.dtype, pd.DatetimeTZDtype):
+        return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT")
+    return col
 
 
 def get_units(value):

diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
@@ -2431,6 +2431,41 @@ def test_to_datetime_errors_non_scalar_not_implemented(errors):
         cudf.to_datetime([1, ""], unit="s", errors=errors)
 
 
+@pytest.mark.parametrize(
+    "box", [list, pd.Index, cudf.Index, pd.Series, cudf.Series]
+)
+@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
+def test_to_datetime_arraylike_utc_true(box, dtype):
+    pd_data = [1, 2]
+    cudf_data = box(pd_data)
+    if box is not list:
+        cudf_data = cudf_data.astype(dtype)
+    if box is cudf.Series or box is pd.Series:
+        pd_data = pd.Series(pd_data)
+    result = cudf.to_datetime(cudf_data, utc=True)
+    expected = pd.to_datetime(pd_data, utc=True)
+    assert_eq(result, expected)
+
+
+@pytest.mark.xfail(
+    raises=TypeError,
+    reason="libcudf.copying.get_element doesn't understand pd.DatetimeTZDtype",
+)
+def test_to_datetime_scalar_utc_true():
+    data = pd.Timestamp(2020, 1, 1)
+    with cudf.option_context("mode.pandas_compatible", True):
+        result = cudf.to_datetime(data, utc=True)
+    expected = pd.Timestamp(year=2020, month=1, day=1, tz="UTC")
+    assert_eq(result, expected)
+
+
+def test_to_datetime_dataframe_utc_true():
+    data = cudf.DataFrame([[2020, 1, 1]], columns=["year", "month", "day"])
+    result = cudf.to_datetime(data, utc=True)
+    expected = pd.Series([datetime.datetime(2020, 1, 1)]).dt.tz_localize("UTC")
+    assert_eq(result, expected)
+
+
 def test_datetimeindex_dtype_np_dtype():
     dtype = np.dtype("datetime64[ns]")
     data = [1]