From 27d84c9f4b415307685f92039777ec730ed370a2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 12 Aug 2023 13:20:19 -0700 Subject: [PATCH 01/10] Change NA to NaT for datetime and timedelta types --- python/cudf/cudf/__init__.py | 4 +- python/cudf/cudf/_lib/scalar.pyx | 6 +- python/cudf/cudf/core/dataframe.py | 11 ++- python/cudf/cudf/core/index.py | 9 +- python/cudf/cudf/core/missing.py | 5 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 14 ++- python/cudf/cudf/tests/test_repr.py | 105 ++++++++++------------- python/cudf/cudf/tests/test_timedelta.py | 2 +- python/cudf/cudf/utils/dtypes.py | 1 + 10 files changed, 84 insertions(+), 75 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d8cee514fb7..3bb25e051e3 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -58,7 +58,7 @@ UInt64Index, interval_range, ) -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT from cudf.core.multiindex import MultiIndex from cudf.core.reshape import ( concat, @@ -90,7 +90,7 @@ option_context, set_option, ) -from cudf.utils.dtypes import _NA_REP +from cudf.utils.dtypes import _NA_REP, _NAT_REP from cudf.utils.utils import clear_cache cuda.set_memory_manager(RMMNumbaManager) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0ff736b9204..5513e864444 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -31,7 +31,7 @@ from cudf._lib.types import ( duration_unit_map, ) from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view @@ -495,7 +495,7 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return NA + return NaT cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -536,7 +536,7 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return NA + return NaT cdef libcudf_types.data_type cdtype = s_ptr[0].type() diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e4b944a88af..84e0e38fafc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1694,7 +1694,16 @@ def _clean_nulls_from_dataframe(self, df): # TODO we need to handle this pass elif df._data[col].has_nulls(): - df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) + if isinstance( + df._data[col], + ( + cudf.core.column.DatetimeColumn, + cudf.core.column.TimeDeltaColumn, + ), + ): + df[col] = df._data[col].astype("str").fillna(cudf._NAT_REP) + else: + df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) else: df[col] = df._data[col] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c3d468e5656..8436fec4f90 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1500,7 +1500,12 @@ def __contains__(self, item): def _clean_nulls_from_index(self): if self._values.has_nulls(): return cudf.Index( - self._values.astype("str").fillna(cudf._NA_REP), name=self.name + self._values.astype("str").fillna( + cudf._NAT_REP + if isinstance(self, (DatetimeIndex, TimedeltaIndex)) + else cudf._NA_REP + ), + name=self.name, ) return self @@ -2611,7 +2616,7 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"): ... '2018-10-28 03:46:00'])) >>> s.dt.tz_localize("CET") 0 2018-10-28 01:20:00.000000000 - 1 + 1 NaT 2 2018-10-28 03:46:00.000000000 dtype: datetime64[ns, CET] diff --git a/python/cudf/cudf/core/missing.py b/python/cudf/cudf/core/missing.py index 02bcb7636f4..75985a2cf3e 100644 --- a/python/cudf/cudf/core/missing.py +++ b/python/cudf/cudf/core/missing.py @@ -1,9 +1,10 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # Pandas NAType enforces a single instance exists at a time # instantiating this class will yield the existing instance # of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). from pandas import NA +from pandas import NaT -__all__ = ["NA"] +__all__ = ["NA", "NaT"] diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bb0c25a9970..cda4dfc6f44 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -504,7 +504,7 @@ def __repr__(self): ), ): preprocess_df[name] = col.astype("str").fillna( - cudf._NA_REP + cudf._NAT_REP ) tuples_list = list( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a0cec69eca9..7b9e9593550 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1404,7 +1404,19 @@ def __repr__(self): cudf.core.column.timedelta.TimeDeltaColumn, ): output = repr( - preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() + preprocess.astype("O") + .fillna( + cudf._NAT_REP + if isinstance( + preprocess._column, + ( + cudf.core.column.TimeDeltaColumn, + cudf.core.column.DatetimeColumn, + ), + ) + else cudf._NA_REP + ) + .to_pandas() ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index e7fa401f1ec..0dc65d3cbee 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -41,9 +41,7 @@ def test_null_series(nrows, dtype): pd.options.display.max_rows = int(nrows) psrepr = repr(ps) - psrepr = psrepr.replace("NaN", "") - psrepr = psrepr.replace("NaT", "") - psrepr = psrepr.replace("None", "") + psrepr = psrepr.replace("NaN", "").replace("None", "") if "UInt" in psrepr: psrepr = psrepr.replace("UInt", "uint") elif "Int" in psrepr: @@ -71,12 +69,7 @@ def test_null_dataframe(ncols): gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) - pdf_repr = ( - repr(pdf) - .replace("NaN", "") - .replace("NaT", "") - .replace("None", "") - ) + pdf_repr = repr(pdf).replace("NaN", "").replace("None", "") assert pdf_repr.split() == repr(gdf).split() pd.reset_option("display.max_columns") @@ -359,33 +352,33 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), "DatetimeIndex([1970-01-01 00:00:00.000000010, " "1970-01-01 00:00:00.000000020," - "\n 1970-01-01 00:00:00.000000030, ],\n " + "\n 1970-01-01 00:00:00.000000030, NaT],\n " "dtype='datetime64[ns]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), "DatetimeIndex([1970-01-01 00:00:10, " "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " ],\n dtype='datetime64[s]')", + " NaT],\n dtype='datetime64[s]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), "DatetimeIndex([1970-01-01 00:00:00.000010, " "1970-01-01 00:00:00.000020,\n " - "1970-01-01 00:00:00.000030, ],\n " + "1970-01-01 00:00:00.000030, NaT],\n " "dtype='datetime64[us]')", ), ( cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), "DatetimeIndex([1970-01-01 00:00:00.010, " "1970-01-01 00:00:00.020,\n " - "1970-01-01 00:00:00.030, ],\n " + "1970-01-01 00:00:00.030, NaT],\n " "dtype='datetime64[ms]')", ), ( cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex([, , , , , , , , " - ",\n ],\n dtype='datetime64[ms]')", + "DatetimeIndex([NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, " + "NaT, NaT], dtype='datetime64[ms]')", ), ], ) @@ -473,12 +466,7 @@ def test_dataframe_null_index_repr(df, pandas_special_case): pdf = df gdf = cudf.from_pandas(pdf) - expected_repr = ( - repr(pdf) - .replace("NaN", "") - .replace("NaT", "") - .replace("None", "") - ) + expected_repr = repr(pdf).replace("NaN", "").replace("None", "") actual_repr = repr(gdf) if pandas_special_case: @@ -552,12 +540,7 @@ def test_series_null_index_repr(sr, pandas_special_case): psr = sr gsr = cudf.from_pandas(psr) - expected_repr = ( - repr(psr) - .replace("NaN", "") - .replace("NaT", "") - .replace("None", "") - ) + expected_repr = repr(psr).replace("NaN", "").replace("None", "") actual_repr = repr(gsr) if pandas_special_case: @@ -603,9 +586,7 @@ def test_timedelta_series_s_us_repr(data, dtype): sr = cudf.Series(data, dtype=dtype) psr = sr.to_pandas() - expected = ( - repr(psr).replace("timedelta64[ns]", dtype).replace("NaT", "") - ) + expected = repr(psr).replace("timedelta64[ns]", dtype) actual = repr(sr) assert expected.split() == actual.split() @@ -658,7 +639,7 @@ def test_timedelta_series_s_us_repr(data, dtype): """ 0 0 days 00:00:00.001000000 1 0 days 00:00:00.000200000 - 2 + 2 NaT dtype: timedelta64[ns] """ ), @@ -669,7 +650,7 @@ def test_timedelta_series_s_us_repr(data, dtype): """ 0 0 days 00:16:40 1 0 days 00:03:20 - 2 + 2 NaT dtype: timedelta64[ms] """ ), @@ -680,11 +661,11 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 - 1 - 2 - 3 - 4 + 0 NaT + 1 NaT + 2 NaT + 3 NaT + 4 NaT dtype: timedelta64[ns] """ ), @@ -695,11 +676,11 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 - 1 - 2 - 3 - 4 + 0 NaT + 1 NaT + 2 NaT + 3 NaT + 4 NaT dtype: timedelta64[ms] """ ), @@ -930,10 +911,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a b 0 1579 days 08:54:14 10 - 1 11 + 1 NaT 11 2 2839 days 15:29:05 22 3 2586 days 00:33:31 33 - 4 44 + 4 NaT 44 5 42066 days 12:52:14 55 6 0 days 06:27:14 66 """ @@ -961,10 +942,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a a 1579 days 08:54:14 - b + b NaT c 2839 days 15:29:05 d 2586 days 00:33:31 - e + e NaT f 42066 days 12:52:14 g 0 days 06:27:14 """ @@ -994,10 +975,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a 1 days 13:54:17.654 1 - 2 + NaT 2 2 days 20:09:05.345 3 2 days 14:03:52.411 4 - 5 + NaT 5 42 days 01:35:48.734 6 0 days 00:00:23.234 7 """ @@ -1027,10 +1008,10 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): """ a 0 days 00:00:00.136457654 a - f + NaT f 0 days 00:00:00.245345345 q 0 days 00:00:00.223432411 e - w + NaT w 0 days 00:00:03.634548734 e 0 days 00:00:00.000023234 t """ @@ -1057,7 +1038,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): cudf.Index( [None, None, None, None, None], dtype="timedelta64[us]" ), - "TimedeltaIndex([, , , , ], " + "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " "dtype='timedelta64[us]')", ), ( @@ -1073,9 +1054,9 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[us]", ), - "TimedeltaIndex([0 days 00:02:16.457654, , " + "TimedeltaIndex([0 days 00:02:16.457654, NaT, " "0 days 00:04:05.345345, " - "0 days 00:03:43.432411, ," + "0 days 00:03:43.432411, NaT," " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," " dtype='timedelta64[us]')", ), @@ -1092,8 +1073,8 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], dtype="timedelta64[s]", ), - "TimedeltaIndex([1579 days 08:54:14, , 2839 days 15:29:05," - " 2586 days 00:33:31, , 42066 days 12:52:14, " + "TimedeltaIndex([1579 days 08:54:14, NaT, 2839 days 15:29:05," + " 2586 days 00:33:31, NaT, 42066 days 12:52:14, " "0 days 06:27:14]," " dtype='timedelta64[s]')", ), @@ -1190,7 +1171,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([( '', 'abc'), + MultiIndex([( 'NaT', 'abc'), ('1970-01-01 00:00:00.000000001', ), ('1970-01-01 00:00:00.000000002', 'xyz'), ('1970-01-01 00:00:00.000000003', )], @@ -1210,7 +1191,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([( '', 'abc', 0.345), + MultiIndex([( 'NaT', 'abc', 0.345), ('1970-01-01 00:00:00.000000001', , ), ('1970-01-01 00:00:00.000000002', 'xyz', 100.0), ('1970-01-01 00:00:00.000000003', , 10.0)], @@ -1230,7 +1211,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([('abc', '', 0.345), + MultiIndex([('abc', 'NaT', 0.345), ( , '0 days 00:00:00.000000001', ), ('xyz', '0 days 00:00:00.000000002', 100.0), ( , '0 days 00:00:00.000000003', 10.0)], @@ -1272,10 +1253,10 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([('', ), - ('', ), - ('', ), - ('', )], + MultiIndex([('NaT', ), + ('NaT', ), + ('NaT', ), + ('NaT', )], names=['b', 'a']) """ ), diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 82ef309d116..681b42fd3bd 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1441,4 +1441,4 @@ def test_timdelta_binop_tz_timestamp(op): def test_timedelta_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") - assert s[2] is cudf.NA + assert s[2] is cudf.NaT diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index d5e9e5854df..a25bbf31b70 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -16,6 +16,7 @@ from cudf.core.missing import NA _NA_REP = "" +_NAT_REP = "NaT" """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special From 4e7dbd72a8fe9adc5ebdcd4f7aeb5244f4dd13ab Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 12 Aug 2023 14:04:26 -0700 Subject: [PATCH 02/10] style & test --- python/cudf/cudf/core/missing.py | 3 +-- python/cudf/cudf/tests/test_datetime.py | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/missing.py b/python/cudf/cudf/core/missing.py index 75985a2cf3e..0d48a1d4136 100644 --- a/python/cudf/cudf/core/missing.py +++ b/python/cudf/cudf/core/missing.py @@ -4,7 +4,6 @@ # Pandas NAType enforces a single instance exists at a time # instantiating this class will yield the existing instance # of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). -from pandas import NA -from pandas import NaT +from pandas import NA, NaT __all__ = ["NA", "NaT"] diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index dcb8781e712..a59f8b62e7e 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2107,3 +2107,8 @@ def test_datetime_binop_tz_timestamp(op): date_scalar = datetime.datetime.now(datetime.timezone.utc) with pytest.raises(NotImplementedError): op(s, date_scalar) + + +def test_datetime_getitem_na(): + s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") + assert s[2] is cudf.NaT From c7d9d4091541451b505fade68a8cfb2afa4c4fc6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 12 Aug 2023 16:36:50 -0700 Subject: [PATCH 03/10] more fixes --- python/cudf/cudf/__init__.py | 1 + python/cudf/cudf/_lib/scalar.pyx | 4 ++-- python/cudf/cudf/core/_internals/timezones.py | 2 +- python/cudf/cudf/core/_internals/where.py | 6 +++--- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/scalar.py | 10 +++++++--- python/cudf/cudf/testing/testing.py | 4 ++-- python/cudf/cudf/tests/test_binops.py | 7 ++++++- python/cudf/cudf/tests/test_list.py | 7 ++++++- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_scalar.py | 16 +++++++++++++--- 11 files changed, 44 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 3bb25e051e3..e593a2ca219 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -124,6 +124,7 @@ "IntervalIndex", "ListDtype", "MultiIndex", + "NaT", "NA", "RangeIndex", "Scalar", diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5513e864444..a2d6f960c69 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -178,7 +178,7 @@ cdef class DeviceScalar: return self.get_raw_ptr()[0].is_valid() def __repr__(self): - if self.value is NA: + if self.value is NA or self.value is NaT: return ( f"{self.__class__.__name__}" f"({self.value}, {repr(self.dtype)})" @@ -586,7 +586,7 @@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): - if slr is None or slr is NA: + if slr is None or slr is NA or slr is NaT: return True elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): return True diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 2895fd3476c..67043d3fbb3 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -186,7 +186,7 @@ def localize( DatetimeColumn, data._scatter_by_column( data.isnull() | (ambiguous | nonexistent), - cudf.Scalar(cudf.NA, dtype=data.dtype), + cudf.Scalar(cudf.NaT, dtype=data.dtype), ), ) gmt_data = local_to_utc(localized, zone_name) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6d4a2990e34..5c07b7c44d7 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. import warnings from typing import Tuple, Union @@ -13,7 +13,7 @@ is_scalar, ) from cudf.core.column import ColumnBase -from cudf.core.missing import NA +from cudf.core.missing import NaT, NA from cudf.utils.dtypes import ( _can_cast, _dtype_can_hold_element, @@ -59,7 +59,7 @@ def _check_and_cast_columns_with_other( f"{type(other).__name__} to {source_dtype.name}" ) - if other in {None, NA}: + if other in {None, NA, NaT}: return _normalize_categorical( source_col, cudf.Scalar(other, dtype=source_dtype) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0b2b9fda2fd..07ada4544c3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,7 +81,7 @@ ListDtype, StructDtype, ) -from cudf.core.missing import NA +from cudf.core.missing import NaT, NA from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( @@ -605,7 +605,7 @@ def __setitem__(self, key: Any, value: Any): self._mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): - if other is NA or other is None: + if other is NA or other is None or other is NaT: return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: # Try and maintain the dtype diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 438e3c7477d..06aeaf1f1f4 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,9 +8,9 @@ import pyarrow as pa import cudf -from cudf.api.types import is_scalar +from cudf.api.types import is_scalar, is_datetime64_dtype, is_timedelta64_dtype from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.missing import NA +from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand from cudf.utils.dtypes import ( get_allowed_combinations_for_operator, @@ -243,7 +243,11 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = NA + value = ( + NaT + if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) + else NA + ) return value, dtype diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 484c013f774..351b1f5c131 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -19,7 +19,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core.missing import NA +from cudf.core.missing import NaT, NA def dtype_can_compare_equal_to_other(dtype): @@ -290,7 +290,7 @@ def assert_column_equal( def null_safe_scalar_equals(left, right): - if left in {NA, np.nan} or right in {NA, np.nan}: + if left in {NA, NaT, np.nan} or right in {NA, NaT, np.nan}: return left is right return left == right diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 65a15d7c567..549cd8da78e 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1700,7 +1700,12 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is cudf.NA + assert result.value is ( + cudf.NaT + if cudf.api.types.is_datetime64_dtype(result.dtype) + or cudf.api.types.is_timedelta64_dtype(result.dtype) + else cudf.NA + ) # make sure dtype is the same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 87a0424998f..5dd58d8a875 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -697,7 +697,12 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is cudf.NA + assert slr.value is ( + cudf.NaT + if cudf.api.types.is_datetime64_dtype(slr.dtype) + or cudf.api.types.is_timedelta64_dtype(slr.dtype) + else cudf.NA + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ff4c2e2a14d..a08ab211b8e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2264,7 +2264,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): pdf = pdf.drop(columns=["col_category", "col_bool"]) if not add_nulls: - # Timedelta types convert NA to None when reading from parquet into + # Timedelta types convert NaT to None when reading from parquet into # pandas which interferes with series.max()/min() for t in TIMEDELTA_TYPES: pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index c1aeb987eff..d73a1d40aaa 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -212,7 +212,12 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - assert s.value is cudf.NA + if cudf.api.types.is_datetime64_dtype( + dtype + ) or cudf.api.types.is_timedelta64_dtype(dtype): + assert s.value is cudf.NaT + else: + assert s.value is cudf.NA assert s.dtype == ( cudf.dtype(dtype) if not isinstance(dtype, cudf.core.dtypes.DecimalDtype) @@ -236,7 +241,7 @@ def test_null_scalar(dtype): ) def test_nat_to_null_scalar_succeeds(value): s = cudf.Scalar(value) - assert s.value is cudf.NA + assert s.value is cudf.NaT assert not s.is_valid() assert s.dtype == value.dtype @@ -349,7 +354,12 @@ def test_scalar_implicit_int_conversion(value): def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls(pd.NA) + cls( + pd.NaT + if cudf.api.types.is_datetime64_dtype(dtype) + or cudf.api.types.is_timedelta64_dtype(dtype) + else pd.NA + ) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) From 496e88284da74d6391b74f497b03599e6bea86a5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 12 Aug 2023 16:41:13 -0700 Subject: [PATCH 04/10] isort --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/testing/testing.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 5c07b7c44d7..c6ba5a70937 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -13,7 +13,7 @@ is_scalar, ) from cudf.core.column import ColumnBase -from cudf.core.missing import NaT, NA +from cudf.core.missing import NA, NaT from cudf.utils.dtypes import ( _can_cast, _dtype_can_hold_element, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 07ada4544c3..1ddb95552d1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,7 +81,7 @@ ListDtype, StructDtype, ) -from cudf.core.missing import NaT, NA +from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 351b1f5c131..a9c54ddcaa1 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -19,7 +19,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core.missing import NaT, NA +from cudf.core.missing import NA, NaT def dtype_can_compare_equal_to_other(dtype): From fabab5f63b351a48d24f3fabc18e3aed58f40c90 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 12 Aug 2023 16:49:50 -0700 Subject: [PATCH 05/10] isort --- python/cudf/cudf/core/scalar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 06aeaf1f1f4..a20628f6601 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_scalar, is_datetime64_dtype, is_timedelta64_dtype +from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand From 09466815b0d4da61b80115de4e0612965ccde82a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 14 Aug 2023 08:08:59 -0700 Subject: [PATCH 06/10] Add utility is_na_like --- python/cudf/cudf/_lib/scalar.pyx | 4 ++-- python/cudf/cudf/core/_internals/where.py | 3 +-- python/cudf/cudf/core/column/column.py | 3 +-- python/cudf/cudf/utils/utils.py | 8 ++++++++ 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index a2d6f960c69..39a1b0609cf 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -178,7 +178,7 @@ cdef class DeviceScalar: return self.get_raw_ptr()[0].is_valid() def __repr__(self): - if self.value is NA or self.value is NaT: + if cudf.utils.utils.is_na_like(self.value): return ( f"{self.__class__.__name__}" f"({self.value}, {repr(self.dtype)})" @@ -586,7 +586,7 @@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): - if slr is None or slr is NA or slr is NaT: + if cudf.utils.utils.is_na_like(slr): return True elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): return True diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index c6ba5a70937..0f65861dc72 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -13,7 +13,6 @@ is_scalar, ) from cudf.core.column import ColumnBase -from cudf.core.missing import NA, NaT from cudf.utils.dtypes import ( _can_cast, _dtype_can_hold_element, @@ -59,7 +58,7 @@ def _check_and_cast_columns_with_other( f"{type(other).__name__} to {source_dtype.name}" ) - if other in {None, NA, NaT}: + if cudf.utils.utils.is_na_like(other): return _normalize_categorical( source_col, cudf.Scalar(other, dtype=source_dtype) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1ddb95552d1..263da9b350e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -81,7 +81,6 @@ ListDtype, StructDtype, ) -from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( @@ -605,7 +604,7 @@ def __setitem__(self, key: Any, value: Any): self._mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): - if other is NA or other is None or other is NaT: + if cudf.utils.utils.is_na_like(other): return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: # Try and maintain the dtype diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index dadd9aa2cc4..e2cb3f145a1 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -345,6 +345,14 @@ def search_range(x: int, ri: range, *, side: str) -> int: return max(min(len(ri), offset), 0) +def is_na_like(obj): + """ + Check if `obj` is a cudf NA value, + i.e., None, cudf.NA or cudf.NaT + """ + return obj is None or obj is cudf.NA or obj is cudf.NaT + + def _get_color_for_nvtx(name): m = hashlib.sha256() m.update(name.encode()) From 2c2039b60532ecc7df8e2b282ef57c6c1f8fe69d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 13:33:24 -0500 Subject: [PATCH 07/10] Update python/cudf/cudf/core/series.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7b9e9593550..10735786329 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1404,7 +1404,7 @@ def __repr__(self): cudf.core.column.timedelta.TimeDeltaColumn, ): output = repr( - preprocess.astype("O") + preprocess.astype("str") .fillna( cudf._NAT_REP if isinstance( From 578a999b2ac6819ce0cf9f21b2f159a4d260ab2d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 14 Aug 2023 11:52:48 -0700 Subject: [PATCH 08/10] Address reviews --- python/cudf/cudf/__init__.py | 1 - python/cudf/cudf/core/dataframe.py | 23 +++++++++++++---------- python/cudf/cudf/core/index.py | 15 ++++++++------- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 27 +++++++++++++-------------- python/cudf/cudf/tests/test_repr.py | 3 +-- python/cudf/cudf/utils/dtypes.py | 5 +---- 7 files changed, 37 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e593a2ca219..05a008aa8b7 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -90,7 +90,6 @@ option_context, set_option, ) -from cudf.utils.dtypes import _NA_REP, _NAT_REP from cudf.utils.utils import clear_cache cuda.set_memory_manager(RMMNumbaManager) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 84e0e38fafc..fc624c0b8eb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1694,16 +1694,19 @@ def _clean_nulls_from_dataframe(self, df): # TODO we need to handle this pass elif df._data[col].has_nulls(): - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ): - df[col] = df._data[col].astype("str").fillna(cudf._NAT_REP) - else: - df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) + fill_value = ( + str(cudf.NaT) + if isinstance( + df._data[col], + ( + cudf.core.column.DatetimeColumn, + cudf.core.column.TimeDeltaColumn, + ), + ) + else str(cudf.NA) + ) + + df[col] = df._data[col].astype("str").fillna(fill_value) else: df[col] = df._data[col] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8436fec4f90..b7ee85758b9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1347,7 +1347,7 @@ def __repr__(self): else: output = repr(preprocess.to_pandas()) - output = output.replace("nan", cudf._NA_REP) + output = output.replace("nan", str(cudf.NA)) elif preprocess._values.nullable: output = repr(self._clean_nulls_from_index().to_pandas()) @@ -1499,12 +1499,13 @@ def __contains__(self, item): def _clean_nulls_from_index(self): if self._values.has_nulls(): + fill_value = ( + str(cudf.NaT) + if isinstance(self, (DatetimeIndex, TimedeltaIndex)) + else str(cudf.NA) + ) return cudf.Index( - self._values.astype("str").fillna( - cudf._NAT_REP - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else cudf._NA_REP - ), + self._values.astype("str").fillna(fill_value), name=self.name, ) @@ -3259,7 +3260,7 @@ def str(self): def _clean_nulls_from_index(self): if self._values.has_nulls(): - return self.fillna(cudf._NA_REP) + return self.fillna(str(cudf.NA)) else: return self diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index dbbece9dc24..54c67458b55 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -504,7 +504,7 @@ def __repr__(self): ), ): preprocess_df[name] = col.astype("str").fillna( - cudf._NAT_REP + str(cudf.NaT) ) tuples_list = list( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7b9e9593550..b30a9615f0c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1403,20 +1403,19 @@ def __repr__(self): preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn, ): - output = repr( - preprocess.astype("O") - .fillna( - cudf._NAT_REP - if isinstance( - preprocess._column, - ( - cudf.core.column.TimeDeltaColumn, - cudf.core.column.DatetimeColumn, - ), - ) - else cudf._NA_REP + fill_value = ( + str(cudf.NaT) + if isinstance( + preprocess._column, + ( + cudf.core.column.TimeDeltaColumn, + cudf.core.column.DatetimeColumn, + ), ) - .to_pandas() + else str(cudf.NA) + ) + output = repr( + preprocess.astype("O").fillna(fill_value).to_pandas() ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn @@ -1448,7 +1447,7 @@ def __repr__(self): min_rows=min_rows, max_rows=max_rows, length=show_dimensions, - na_rep=cudf._NA_REP, + na_rep=str(cudf.NA), ) else: output = repr(preprocess.to_pandas()) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 0dc65d3cbee..b944e0483d0 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -40,8 +40,7 @@ def test_null_series(nrows, dtype): ps = sr.to_pandas() pd.options.display.max_rows = int(nrows) - psrepr = repr(ps) - psrepr = psrepr.replace("NaN", "").replace("None", "") + psrepr = repr(ps).replace("NaN", "").replace("None", "") if "UInt" in psrepr: psrepr = psrepr.replace("UInt", "uint") elif "Int" in psrepr: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a25bbf31b70..cb04a61dd23 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -13,10 +13,7 @@ import cudf from cudf._typing import DtypeObj from cudf.api.types import is_bool, is_float, is_integer -from cudf.core.missing import NA -_NA_REP = "" -_NAT_REP = "NaT" """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special @@ -640,7 +637,7 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ - if from_dtype in {None, NA}: + if cudf.utils.utils.is_na_like(from_dtype): return True if isinstance(from_dtype, type): from_dtype = cudf.dtype(from_dtype) From b948e3e6dee40663ded6d44c4bdfd623d3f08e08 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 14 Aug 2023 12:05:14 -0700 Subject: [PATCH 09/10] isort --- python/cudf/cudf/utils/dtypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index cb04a61dd23..ea96a0859ce 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,7 +14,6 @@ from cudf._typing import DtypeObj from cudf.api.types import is_bool, is_float, is_integer - """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special handling is required when converting a Boolean column into arrow. From 79074570b5ef3e7e0ac7ca3592f26bcbf04f1eff Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 14 Aug 2023 14:32:17 -0500 Subject: [PATCH 10/10] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 05a008aa8b7..e5c78fca893 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -123,8 +123,8 @@ "IntervalIndex", "ListDtype", "MultiIndex", - "NaT", "NA", + "NaT", "RangeIndex", "Scalar", "Series",