Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change NA to NaT for datetime and timedelta types #13868

Merged
merged 14 commits into from
Aug 14, 2023
4 changes: 2 additions & 2 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
UInt64Index,
interval_range,
)
from cudf.core.missing import NA
from cudf.core.missing import NA, NaT
from cudf.core.multiindex import MultiIndex
from cudf.core.reshape import (
concat,
Expand Down Expand Up @@ -90,7 +90,6 @@
option_context,
set_option,
)
from cudf.utils.dtypes import _NA_REP
from cudf.utils.utils import clear_cache

cuda.set_memory_manager(RMMNumbaManager)
Expand Down Expand Up @@ -124,6 +123,7 @@
"IntervalIndex",
"ListDtype",
"MultiIndex",
"NaT",
"NA",
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
"RangeIndex",
"Scalar",
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ from cudf._lib.types import (
duration_unit_map,
)
from cudf.core.dtypes import ListDtype, StructDtype
from cudf.core.missing import NA
from cudf.core.missing import NA, NaT

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column_view cimport column_view
Expand Down Expand Up @@ -178,7 +178,7 @@ cdef class DeviceScalar:
return self.get_raw_ptr()[0].is_valid()

def __repr__(self):
if self.value is NA:
if cudf.utils.utils.is_na_like(self.value):
return (
f"{self.__class__.__name__}"
f"({self.value}, {repr(self.dtype)})"
Expand Down Expand Up @@ -495,7 +495,7 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s):
cdef scalar* s_ptr = s.get()

if not s_ptr[0].is_valid():
return NA
return NaT

cdef libcudf_types.data_type cdtype = s_ptr[0].type()

Expand Down Expand Up @@ -536,7 +536,7 @@ cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s):
cdef scalar* s_ptr = s.get()

if not s_ptr[0].is_valid():
return NA
return NaT

cdef libcudf_types.data_type cdtype = s_ptr[0].type()

Expand Down Expand Up @@ -586,7 +586,7 @@ def as_device_scalar(val, dtype=None):


def _is_null_host_scalar(slr):
if slr is None or slr is NA:
if cudf.utils.utils.is_na_like(slr):
return True
elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr):
return True
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def localize(
DatetimeColumn,
data._scatter_by_column(
data.isnull() | (ambiguous | nonexistent),
cudf.Scalar(cudf.NA, dtype=data.dtype),
cudf.Scalar(cudf.NaT, dtype=data.dtype),
),
)
gmt_data = local_to_utc(localized, zone_name)
Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/core/_internals/where.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

import warnings
from typing import Tuple, Union
Expand All @@ -13,7 +13,6 @@
is_scalar,
)
from cudf.core.column import ColumnBase
from cudf.core.missing import NA
from cudf.utils.dtypes import (
_can_cast,
_dtype_can_hold_element,
Expand Down Expand Up @@ -59,7 +58,7 @@ def _check_and_cast_columns_with_other(
f"{type(other).__name__} to {source_dtype.name}"
)

if other in {None, NA}:
if cudf.utils.utils.is_na_like(other):
return _normalize_categorical(
source_col, cudf.Scalar(other, dtype=source_dtype)
)
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@
ListDtype,
StructDtype,
)
from cudf.core.missing import NA
from cudf.core.mixins import BinaryOperand, Reducible
from cudf.errors import MixedTypeError
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -605,7 +604,7 @@ def __setitem__(self, key: Any, value: Any):
self._mimic_inplace(out, inplace=True)

def _wrap_binop_normalization(self, other):
if other is NA or other is None:
if cudf.utils.utils.is_na_like(other):
return cudf.Scalar(other, dtype=self.dtype)
if isinstance(other, np.ndarray) and other.ndim == 0:
# Try and maintain the dtype
Expand Down
14 changes: 13 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1694,7 +1694,19 @@ def _clean_nulls_from_dataframe(self, df):
# TODO we need to handle this
pass
elif df._data[col].has_nulls():
df[col] = df._data[col].astype("str").fillna(cudf._NA_REP)
fill_value = (
str(cudf.NaT)
if isinstance(
df._data[col],
(
cudf.core.column.DatetimeColumn,
cudf.core.column.TimeDeltaColumn,
),
)
else str(cudf.NA)
)

df[col] = df._data[col].astype("str").fillna(fill_value)
else:
df[col] = df._data[col]

Expand Down
14 changes: 10 additions & 4 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,7 +1347,7 @@ def __repr__(self):
else:
output = repr(preprocess.to_pandas())

output = output.replace("nan", cudf._NA_REP)
output = output.replace("nan", str(cudf.NA))
elif preprocess._values.nullable:
output = repr(self._clean_nulls_from_index().to_pandas())

Expand Down Expand Up @@ -1499,8 +1499,14 @@ def __contains__(self, item):

def _clean_nulls_from_index(self):
if self._values.has_nulls():
fill_value = (
str(cudf.NaT)
if isinstance(self, (DatetimeIndex, TimedeltaIndex))
else str(cudf.NA)
)
return cudf.Index(
self._values.astype("str").fillna(cudf._NA_REP), name=self.name
self._values.astype("str").fillna(fill_value),
name=self.name,
)

return self
Expand Down Expand Up @@ -2611,7 +2617,7 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
... '2018-10-28 03:46:00']))
>>> s.dt.tz_localize("CET")
0 2018-10-28 01:20:00.000000000
1 <NA>
1 NaT
2 2018-10-28 03:46:00.000000000
dtype: datetime64[ns, CET]

Expand Down Expand Up @@ -3254,7 +3260,7 @@ def str(self):

def _clean_nulls_from_index(self):
if self._values.has_nulls():
return self.fillna(cudf._NA_REP)
return self.fillna(str(cudf.NA))
else:
return self

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/missing.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
# Copyright (c) 2018-2023, NVIDIA CORPORATION.


# pandas' NAType enforces that only a single instance exists at a time;
# instantiating this class yields the existing instance of
# pandas._libs.missing.NAType, so id(cudf.NA) == id(pd.NA).
from pandas import NA
from pandas import NA, NaT

__all__ = ["NA"]
__all__ = ["NA", "NaT"]
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ def __repr__(self):
),
):
preprocess_df[name] = col.astype("str").fillna(
cudf._NA_REP
str(cudf.NaT)
)

tuples_list = list(
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import pyarrow as pa

import cudf
from cudf.api.types import is_scalar
from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
from cudf.core.dtypes import ListDtype, StructDtype
from cudf.core.missing import NA
from cudf.core.missing import NA, NaT
from cudf.core.mixins import BinaryOperand
from cudf.utils.dtypes import (
get_allowed_combinations_for_operator,
Expand Down Expand Up @@ -243,7 +243,11 @@ def _preprocess_host_value(self, value, dtype):
dtype = cudf.dtype(dtype)

if not valid:
value = NA
value = (
NaT
if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype)
else NA
)

return value, dtype

Expand Down
15 changes: 13 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1403,8 +1403,19 @@ def __repr__(self):
preprocess._column,
cudf.core.column.timedelta.TimeDeltaColumn,
):
fill_value = (
str(cudf.NaT)
if isinstance(
preprocess._column,
(
cudf.core.column.TimeDeltaColumn,
cudf.core.column.DatetimeColumn,
),
)
else str(cudf.NA)
)
output = repr(
preprocess.astype("O").fillna(cudf._NA_REP).to_pandas()
preprocess.astype("str").fillna(fill_value).to_pandas()
)
elif isinstance(
preprocess._column, cudf.core.column.CategoricalColumn
Expand Down Expand Up @@ -1436,7 +1447,7 @@ def __repr__(self):
min_rows=min_rows,
max_rows=max_rows,
length=show_dimensions,
na_rep=cudf._NA_REP,
na_rep=str(cudf.NA),
)
else:
output = repr(preprocess.to_pandas())
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
is_string_dtype,
is_struct_dtype,
)
from cudf.core.missing import NA
from cudf.core.missing import NA, NaT


def dtype_can_compare_equal_to_other(dtype):
Expand Down Expand Up @@ -290,7 +290,7 @@ def assert_column_equal(


def null_safe_scalar_equals(left, right):
if left in {NA, np.nan} or right in {NA, np.nan}:
if left in {NA, NaT, np.nan} or right in {NA, NaT, np.nan}:
return left is right
return left == right

Expand Down
7 changes: 6 additions & 1 deletion python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1700,7 +1700,12 @@ def test_scalar_null_binops(op, dtype_l, dtype_r):
rhs = cudf.Scalar(cudf.NA, dtype=dtype_r)

result = op(lhs, rhs)
assert result.value is cudf.NA
assert result.value is (
cudf.NaT
if cudf.api.types.is_datetime64_dtype(result.dtype)
or cudf.api.types.is_timedelta64_dtype(result.dtype)
else cudf.NA
)

# make sure dtype is the same as had there been a valid scalar
valid_lhs = cudf.Scalar(1, dtype=dtype_l)
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2107,3 +2107,8 @@ def test_datetime_binop_tz_timestamp(op):
date_scalar = datetime.datetime.now(datetime.timezone.utc)
with pytest.raises(NotImplementedError):
op(s, date_scalar)


def test_datetime_getitem_na():
    """Check that indexing a null slot of a datetime Series gives ``cudf.NaT``."""
    # The element at position 2 is the only null in the series.
    null_position = 2
    ser = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]")
    element = ser[null_position]
    assert element is cudf.NaT
7 changes: 6 additions & 1 deletion python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,12 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level):
dtype = cudf.ListDtype(dtype)

slr = cudf.Scalar(None, dtype=dtype)
assert slr.value is cudf.NA
assert slr.value is (
cudf.NaT
if cudf.api.types.is_datetime64_dtype(slr.dtype)
or cudf.api.types.is_timedelta64_dtype(slr.dtype)
else cudf.NA
)


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2264,7 +2264,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls):
pdf = pdf.drop(columns=["col_category", "col_bool"])

if not add_nulls:
# Timedelta types convert NA to None when reading from parquet into
# Timedelta types convert NaT to None when reading from parquet into
# pandas which interferes with series.max()/min()
for t in TIMEDELTA_TYPES:
pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t)
Expand Down
Loading