From 9a5a4fec5c80620ca906d28d7e8f662eb21e8198 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jan 2024 15:25:40 -0800 Subject: [PATCH 01/10] default timedelta format the same for all types --- python/cudf/cudf/core/column/timedelta.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 572b3b894dc..7fead6182ee 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -18,13 +18,6 @@ from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _all_bools_with_nulls -_dtype_to_format_conversion = { - "timedelta64[ns]": "%D days %H:%M:%S", - "timedelta64[us]": "%D days %H:%M:%S", - "timedelta64[ms]": "%D days %H:%M:%S", - "timedelta64[s]": "%D days %H:%M:%S", -} - _unit_to_nanoseconds_conversion = { "ns": 1, "us": 1_000, @@ -318,9 +311,7 @@ def as_string_column( self, dtype: Dtype, format=None, **kwargs ) -> "cudf.core.column.StringColumn": if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%D days %H:%M:%S" - ) + format = "%D days %H:%M:%S" if len(self) > 0: return string._timedelta_to_str_typecast_functions[ cudf.dtype(self.dtype) From 2f624085b9c0918aef5414f061c9c73931bbde98 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jan 2024 16:00:23 -0800 Subject: [PATCH 02/10] Simplify some timedelta logic --- python/cudf/cudf/core/column/timedelta.py | 25 ++++++++--------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 7fead6182ee..79ab0f32a4b 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -79,6 +79,8 @@ def __init__( null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) + if self.dtype.kind != "m": + raise TypeError(f"{self.dtype} is not a supported duration type") if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") @@ -94,14 +96,9 @@ def __init__( null_count=null_count, ) - if self.dtype.type is not np.timedelta64: - raise TypeError(f"{self.dtype} is not a supported duration type") - - self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item: DatetimeLikeScalar) -> bool: try: - item = np.timedelta64(item, self._time_unit) + item = np.timedelta64(item, self.time_unit) except ValueError: # If item cannot be converted to duration type # np.timedelta64 raises ValueError, hence `item` @@ -228,16 +225,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: "Cannot perform binary operation on timezone-naive columns" " and timezone-aware timestamps." ) - if isinstance(other, pd.Timestamp): - if other.tz is not None: + if isinstance(other, datetime.datetime): + if other.tzinfo is not None: raise NotImplementedError(tz_error_msg) - other = other.to_datetime64() - elif isinstance(other, pd.Timedelta): - other = other.to_timedelta64() + other = pd.Timestamp(other).to_datetime64() elif isinstance(other, datetime.timedelta): - other = np.timedelta64(other) - elif isinstance(other, datetime.datetime) and other.tzinfo is not None: - raise NotImplementedError(tz_error_msg) + other = pd.Timedelta(other).to_timedelta64() if isinstance(other, np.timedelta64): other_time_unit = cudf.utils.dtypes.get_time_unit(other) @@ -249,7 +242,7 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: else: common_dtype = determine_out_dtype(self.dtype, other.dtype) return cudf.Scalar(other.astype(common_dtype)) - elif np.isscalar(other): + elif is_scalar(other): return cudf.Scalar(other) return NotImplemented @@ -268,7 +261,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def time_unit(self) -> str: - return self._time_unit + return np.datetime_data(self.dtype)[0] def fillna( self, From 1be1f785e80ee61bc4852741397c796381238c32 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jan 2024 16:11:03 -0800 Subject: [PATCH 03/10] Remove private _time_unit attribute --- python/cudf/cudf/core/_internals/timezones.py | 8 ++++---- python/cudf/cudf/core/column/datetime.py | 16 +++++++--------- python/cudf/cudf/core/column/timedelta.py | 3 ++- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 67043d3fbb3..552fe877fb4 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import os import zoneinfo @@ -111,7 +111,7 @@ def _find_ambiguous_and_nonexistent( tz_data_for_zone = get_tz_data(zone_name) transition_times = tz_data_for_zone["transition_times"] offsets = tz_data_for_zone["offsets"].astype( - f"timedelta64[{data._time_unit}]" + f"timedelta64[{data.time_unit}]" ) if len(offsets) == 1: # no transitions @@ -180,7 +180,7 @@ def localize( "Already localized. " "Use `tz_convert` to convert between time zones." ) - dtype = pd.DatetimeTZDtype(data._time_unit, zone_name) + dtype = pd.DatetimeTZDtype(data.time_unit, zone_name) ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name) localized = cast( DatetimeColumn, @@ -227,7 +227,7 @@ def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn: DatetimeTZColumn, build_column( data=utc_time.base_data, - dtype=pd.DatetimeTZDtype(data._time_unit, zone_name), + dtype=pd.DatetimeTZDtype(data.time_unit, zone_name), mask=utc_time.base_mask, size=utc_time.size, offset=utc_time.offset, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 7980b58ab8b..7d96b300d8f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,8 +1,9 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations import datetime +import functools import locale import re from locale import nl_langinfo @@ -236,6 +237,8 @@ def __init__( null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) + if self.dtype.kind != "M": + raise TypeError(f"{self.dtype} is not a supported datetime type") if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") @@ -251,14 +254,9 @@ def __init__( null_count=null_count, ) - if self.dtype.type is not np.datetime64: - raise TypeError(f"{self.dtype} is not a supported datetime type") - - self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item: ScalarLike) -> bool: try: - item_as_dt64 = np.datetime64(item, self._time_unit) + item_as_dt64 = np.datetime64(item, self.time_unit) except ValueError: # If item cannot be converted to datetime type # np.datetime64 raises ValueError, hence `item` @@ -266,9 +264,9 @@ def __contains__(self, item: ScalarLike) -> bool: return False return item_as_dt64.astype("int64") in self.as_numerical - @property + @functools.cached_property def time_unit(self) -> str: - return self._time_unit + return np.datetime_data(self.dtype)[0] @property def year(self) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 79ab0f32a4b..f604cfedc88 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime +import functools from typing import Any, Optional, Sequence, cast import numpy as np @@ -259,7 +260,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - @property + @functools.cached_property def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] From 433ff6e8986f7a0910d5acf46ab11a5c90d0f7b8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jan 2024 10:12:22 -0800 Subject: [PATCH 04/10] Replace self --- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index dfeac2860a5..c1f1d7d44c9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -240,7 +240,7 @@ def __init__( null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) - if self.dtype.kind != "M": + if dtype.kind != "M": raise TypeError(f"{self.dtype} is not a supported datetime type") if data.size % dtype.itemsize: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f604cfedc88..11278048bb1 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -80,7 +80,7 @@ def __init__( null_count: Optional[int] = None, ): dtype = cudf.dtype(dtype) - if self.dtype.kind != "m": + if dtype.kind != "m": raise TypeError(f"{self.dtype} is not a supported duration type") if data.size % dtype.itemsize: From 8365eee62d2f5b99cc96b783cd07eed36c824e01 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:18:49 -0800 Subject: [PATCH 05/10] fix for DatetimeTZDtype --- python/cudf/cudf/core/column/datetime.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c1f1d7d44c9..5e23f4e8c10 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -269,6 +269,8 @@ def __contains__(self, item: ScalarLike) -> bool: @functools.cached_property def time_unit(self) -> str: + if isinstance(self.dtype, pd.DatetimeTZDtype): + return self.dtype.unit return np.datetime_data(self.dtype)[0] @property From f0373d894fc8f8c98b013a02cfac705430858d93 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 11:41:17 -0800 Subject: [PATCH 06/10] Fix contains for tz aware types --- python/cudf/cudf/core/column/datetime.py | 16 ++++++++++------ .../cudf/cudf/tests/series/test_datetimelike.py | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 5e23f4e8c10..0e24a706e03 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -259,13 +259,17 @@ def __init__( def __contains__(self, item: ScalarLike) -> bool: try: - item_as_dt64 = np.datetime64(item, self.time_unit) - except ValueError: - # If item cannot be converted to datetime type - # np.datetime64 raises ValueError, hence `item` - # cannot exist in `self`. + # TODO(pandas2.0): Change _as_unit to as_unit + ts = pd.Timestamp(item)._as_unit(self.time_unit) + except Exception: + # pandas can raise a variety of errors + # item cannot exist in self. return False - return item_as_dt64.astype("int64") in self.as_numerical + if ts.tzinfo is None and isinstance(self.dtype, pd.DatetimeTZDtype): + return False + elif ts.tzinfo is not None: + ts = ts.tz_convert(None) + return ts.to_numpy().astype("int64") in self.as_numerical @functools.cached_property def time_unit(self) -> str: diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index df68eaca399..1e1c80b11bb 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -203,3 +203,18 @@ def test_tz_aware_attributes_local(): result = dti.hour expected = cudf.Index([9, 9, 9], dtype="int16") assert_eq(result, expected) + + +@pytest.mark.parametrize( + "item, expected", + [ + ["2020-01-01", False], + ["2020-01-01T00:00:00+00:00", True], + ["2020-01-01T00:00:00-08:00", False], + ["2019-12-31T16:00:00-08:00", True], + ], +) +def test_contains_tz_aware(item, expected): + dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC") + result = item in dti + assert result == expected From f8397bb949015e8bdb9299109f02b201fdd5f77c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:25:58 -0800 Subject: [PATCH 07/10] Add copyright --- python/cudf/cudf/tests/series/test_datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 1e1c80b11bb..3352da443dd 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import os From 2e95199c536d87e5b3870582b433a7dfc29e5554 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:28:33 -0800 Subject: [PATCH 08/10] Push element indexing logic to subclass --- python/cudf/cudf/core/column/column.py | 8 +------- python/cudf/cudf/core/column/datetime.py | 6 ++++++ python/cudf/cudf/core/column/timedelta.py | 6 ++++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index df5d1c3879a..fe4fd29111f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -529,13 +529,7 @@ def element_indexing(self, index: int): idx = len(self) + idx if idx > len(self) - 1 or idx < 0: raise IndexError("single positional indexer is out-of-bounds") - result = libcudf.copying.get_element(self, idx).value - if cudf.get_option("mode.pandas_compatible"): - if isinstance(result, np.datetime64): - return pd.Timestamp(result) - elif isinstance(result, np.timedelta64): - return pd.Timedelta(result) - return result + return libcudf.copying.get_element(self, idx).value def slice( self, start: int, stop: int, stride: Optional[int] = None diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 7bfc2b2e0f1..34bee25f31e 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -346,6 +346,12 @@ def values(self): "DateTime Arrays is not yet implemented in cudf" ) + def element_indexing(self, index: int): + result = super().element_indexing(index) + if cudf.get_option("mode.pandas_compatible"): + return pd.Timestamp(result) + return result + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 4415f5ad481..2e540280c9a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -117,6 +117,12 @@ def values(self): "TimeDelta Arrays is not yet implemented in cudf" ) + def element_indexing(self, index: int): + result = super().element_indexing(index) + if cudf.get_option("mode.pandas_compatible"): + return pd.Timedelta(result) + return result + @acquire_spill_lock() def to_arrow(self) -> pa.Array: mask = None From 512a0232bea1a7fc961f969d47746ed9429c9200 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 Jan 2024 13:54:40 -0800 Subject: [PATCH 09/10] Remove _time_unit --- python/cudf/cudf/core/column/timedelta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8f8c8f96d64..c543499fb2a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -496,7 +496,7 @@ def components(self, index=None) -> "cudf.DataFrame": _unit_to_nanoseconds_conversion[value[1]], "ns" ).astype(self.dtype) ) - if self._time_unit == value[1]: + if self.time_unit == value[1]: break for name in keys_list: @@ -588,7 +588,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": # performing division operation to extract the number # of nanoseconds. - if self._time_unit != "ns": + if self.time_unit != "ns": res_col = cudf.core.column.full(len(self), 0, dtype="int64") if self.nullable: res_col = res_col.set_mask(self.mask) From 2f88a3ce3555177f0a672649270e41028a03cbc2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Feb 2024 15:37:09 -0800 Subject: [PATCH 10/10] pandas 2.0 in --- python/cudf/cudf/core/column/datetime.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 7de4101a2b2..bd529b6936e 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -266,8 +266,7 @@ def __init__( def __contains__(self, item: ScalarLike) -> bool: try: - # TODO(pandas2.0): Change _as_unit to as_unit - ts = pd.Timestamp(item)._as_unit(self.time_unit) + ts = pd.Timestamp(item).as_unit(self.time_unit) except Exception: # pandas can raise a variety of errors # item cannot exist in self.