Skip to content

Commit

Permalink
Deprecate certain frequency strings (#14967)
Browse files Browse the repository at this point in the history
This PR deprecates the frequency aliases "H", "N", "T", "L", "U" and "S" in all datetime APIs in favor of "h", "ns", "min", "ms", "us" and "s", respectively. This prepares `branch-24.04` for `pandas-2.2` support.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #14967
  • Loading branch information
galipremsagar authored Feb 6, 2024
1 parent 0665575 commit cf32049
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 101 deletions.
32 changes: 25 additions & 7 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import warnings

from cudf.core.buffer import acquire_spill_lock

Expand Down Expand Up @@ -85,19 +87,35 @@ cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
cdef libcudf_datetime.rounding_frequency freq_val

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html
old_to_new_freq_map = {
"H": "h",
"N": "ns",
"T": "min",
"L": "ms",
"U": "us",
"S": "s",
}
if freq in old_to_new_freq_map:
warnings.warn(
f"FutureWarning: {freq} is deprecated and will be "
"removed in a future version, please use "
f"{old_to_new_freq_map[freq]} instead.",
FutureWarning
)
freq = old_to_new_freq_map.get(freq)
if freq == "D":
freq_val = libcudf_datetime.rounding_frequency.DAY
elif freq == "H":
elif freq == "h":
freq_val = libcudf_datetime.rounding_frequency.HOUR
elif freq in ("T", "min"):
elif freq == "min":
freq_val = libcudf_datetime.rounding_frequency.MINUTE
elif freq == "S":
elif freq == "s":
freq_val = libcudf_datetime.rounding_frequency.SECOND
elif freq in ("L", "ms"):
elif freq == "ms":
freq_val = libcudf_datetime.rounding_frequency.MILLISECOND
elif freq in ("U", "us"):
elif freq == "us":
freq_val = libcudf_datetime.rounding_frequency.MICROSECOND
elif freq == "N":
elif freq == "ns":
freq_val = libcudf_datetime.rounding_frequency.NANOSECOND
else:
raise ValueError(f"Invalid resolution: '{freq}'")
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/core/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@
# Boolean gates comparing PANDAS_VERSION against fixed pandas releases.
# Each gate is defined exactly once; PANDAS_GE_220 was previously assigned
# twice with an identical right-hand side.
PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4")
PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0")
PANDAS_LT_203 = PANDAS_VERSION < version.parse("2.0.3")
PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0")
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
import pandas as pd

import cudf
Expand All @@ -17,7 +17,7 @@ def test_tz_localize():


def test_tz_convert():
pidx = pd.date_range("2023-01-01", periods=3, freq="H")
pidx = pd.date_range("2023-01-01", periods=3, freq="h")
idx = cudf.from_pandas(pidx)
pidx = pidx.tz_localize("UTC")
idx = idx.tz_localize("UTC")
Expand All @@ -27,6 +27,6 @@ def test_tz_convert():


def test_delocalize_naive():
    """``tz_localize(None)`` on a naive index must match pandas."""
    # Build the index once with the lowercase "h" alias; the earlier
    # assignment using the deprecated "H" alias was dead code.
    pidx = pd.date_range("2023-01-01", periods=3, freq="h")
    idx = cudf.from_pandas(pidx)
    assert_eq(pidx.tz_localize(None), idx.tz_localize(None))
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/series/test_datetimelike.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

import os

Expand Down Expand Up @@ -130,7 +130,7 @@ def test_delocalize_naive():
"to_tz", ["Europe/London", "America/Chicago", "UTC", None]
)
def test_convert(from_tz, to_tz):
ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="H"))
ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="h"))
gs = cudf.from_pandas(ps)
ps = ps.dt.tz_localize(from_tz)
gs = gs.dt.tz_localize(from_tz)
Expand All @@ -140,7 +140,7 @@ def test_convert(from_tz, to_tz):


def test_convert_from_naive():
    """``dt.tz_convert`` on a series built from a naive range raises ``TypeError``."""
    # Single construction using the lowercase "h" alias; the preceding
    # assignment with the deprecated "H" alias was immediately overwritten.
    gs = cudf.Series(cudf.date_range("2023-01-01", periods=3, freq="h"))
    with pytest.raises(TypeError):
        gs.dt.tz_convert("America/New_York")

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_dataset_timeseries():
gdf = cudf.datasets.timeseries(
"2000",
"2010",
freq="2H",
freq="2h",
dtypes={"value": float, "name": "category", "id": int},
nulls_frequency=0.7,
seed=1,
Expand Down
120 changes: 67 additions & 53 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,12 @@
import cudf
import cudf.testing.dataset_generator as dataset_generator
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_EQ_200, PANDAS_GE_200, PANDAS_GE_210
from cudf.core._compat import (
PANDAS_EQ_200,
PANDAS_GE_200,
PANDAS_GE_210,
PANDAS_GE_220,
)
from cudf.core.index import DatetimeIndex
from cudf.testing._utils import (
DATETIME_TYPES,
Expand All @@ -39,7 +44,7 @@ def data1():

def data2():
return pd.date_range(
"20010101", freq="243434324423423234N", name="times", periods=10
"20010101", freq="243434324423423234ns", name="times", periods=10
)


Expand Down Expand Up @@ -1497,10 +1502,10 @@ def test_is_month_start(data, dtype):
{"hours": 10, "days": 57, "nanoseconds": 3},
"83D",
"17h",
"-680T",
"-680min",
"110546s",
"110546789L",
"110546789248U",
"110546789ms",
"110546789248us",
]


Expand Down Expand Up @@ -1540,7 +1545,7 @@ def test_date_range_start_end_freq(request, start, end, freq):
condition=(
start == "1831-05-08 15:23:21"
and end == "1996-11-21 04:05:30"
and freq == "110546789L"
and freq == "110546789ms"
),
reason="https://github.com/rapidsai/cudf/issues/12133",
)
Expand Down Expand Up @@ -1653,7 +1658,8 @@ def test_date_range_end_freq_periods(request, end, freq, periods):
request.applymarker(
pytest.mark.xfail(
condition=(
isinstance(freq, dict)
not PANDAS_GE_220
and isinstance(freq, dict)
and freq.get("hours", None) == 10
and freq.get("days", None) == 57
and freq.get("nanoseconds", None) == 3
Expand Down Expand Up @@ -1723,30 +1729,34 @@ def test_date_range_raise_overflow():
@pytest.mark.parametrize(
"freqstr_unsupported",
[
"1M",
"2SM",
"1ME",
"2SME",
"3MS",
"4BM",
"5CBM",
"4BME",
"5CBME",
"6SMS",
"7BMS",
"8CBMS",
"Q",
"2BQ",
"QE",
"2BQE",
"3BQS",
"10A",
"10Y",
"9BA",
"9BY",
"8AS",
"10YE",
"9BYE",
"8YS",
"7BAS",
"7BYS",
"BH",
"bh",
"B",
],
)
def test_date_range_raise_unsupported(freqstr_unsupported):
def test_date_range_raise_unsupported(request, freqstr_unsupported):
request.applymarker(
pytest.mark.xfail(
condition=(
not PANDAS_GE_220 and freqstr_unsupported.endswith("E")
),
reason="TODO: Remove this once pandas-2.2 support is added",
)
)
s, e = "2001-01-01", "2008-01-31"
pd.date_range(start=s, end=e, freq=freqstr_unsupported)
with pytest.raises(ValueError, match="does not yet support"):
Expand All @@ -1757,9 +1767,9 @@ def test_date_range_raise_unsupported(freqstr_unsupported):
# is a valid frequency for every 3 milliseconds.
if freqstr_unsupported != "3MS":
freqstr_unsupported = freqstr_unsupported.lower()
pd.date_range(start=s, end=e, freq=freqstr_unsupported)
with pytest.raises(ValueError, match="does not yet support"):
cudf.date_range(start=s, end=e, freq=freqstr_unsupported)
with expect_warning_if(PANDAS_GE_220):
cudf.date_range(start=s, end=e, freq=freqstr_unsupported)


##################################################################
Expand Down Expand Up @@ -1957,7 +1967,7 @@ def test_error_values():
)
@pytest.mark.parametrize("time_type", DATETIME_TYPES)
@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
)
def test_ceil(request, data, time_type, resolution):
alias_map = {"L": "ms", "U": "us", "N": "ns"}
Expand Down Expand Up @@ -2002,7 +2012,7 @@ def test_ceil(request, data, time_type, resolution):
)
@pytest.mark.parametrize("time_type", DATETIME_TYPES)
@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
)
def test_floor(request, data, time_type, resolution):
alias_map = {"L": "ms", "U": "us", "N": "ns"}
Expand Down Expand Up @@ -2048,25 +2058,9 @@ def test_floor(request, data, time_type, resolution):
)
@pytest.mark.parametrize("time_type", DATETIME_TYPES)
@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
)
def test_round(request, data, time_type, resolution):
alias_map = {"L": "ms", "U": "us", "N": "ns"}
request.applymarker(
pytest.mark.xfail(
condition=(
PANDAS_EQ_200
and resolution in {"L", "ms", "U", "us", "N"}
and np.dtype(
f"datetime64[{alias_map.get(resolution, resolution)}]"
)
> np.dtype(time_type)
),
reason="https://github.com/pandas-dev/pandas/issues/52761",
strict=True,
)
)

def test_round(data, time_type, resolution):
gs = cudf.Series(data, dtype=time_type)
ps = gs.to_pandas()

Expand Down Expand Up @@ -2284,20 +2278,20 @@ def test_daterange_pandas_compatibility():
@pytest.mark.parametrize(
"data,dtype,freq",
[
([10], "datetime64[ns]", "2N"),
([10, 12, 14, 16], "datetime64[ns]", "2N"),
([10, 11, 12, 13], "datetime64[ns]", "1N"),
([10], "datetime64[ns]", "2ns"),
([10, 12, 14, 16], "datetime64[ns]", "2ns"),
([10, 11, 12, 13], "datetime64[ns]", "1ns"),
([100, 200, 300, 400], "datetime64[s]", "100s"),
([101, 201, 301, 401], "datetime64[ms]", "100ms"),
],
)
def test_datetime_index_with_freq(request, data, dtype, freq):
request.applymarker(
pytest.mark.xfail(
condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"),
reason="Pandas < 2.0 lacks non-nano-second dtype support.",
)
)
# request.applymarker(
# pytest.mark.xfail(
# condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"),
# reason="Pandas < 2.0 lacks non-nano-second dtype support.",
# )
# )
actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq)
expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq)
assert_eq(actual, expected)
Expand All @@ -2306,7 +2300,7 @@ def test_datetime_index_with_freq(request, data, dtype, freq):
@pytest.mark.parametrize(
"data,dtype,freq",
[
([10, 1232, 13244, 13426], "datetime64[ns]", "2N"),
([10, 1232, 13244, 13426], "datetime64[ns]", "2ns"),
([10, 11, 12, 13], "datetime64[ns]", "1s"),
([10000, 200, 300, 400], "datetime64[s]", "100s"),
([107871, 201, 301, 401], "datetime64[ms]", "100ns"),
Expand Down Expand Up @@ -2454,3 +2448,23 @@ def test_dateimeindex_from_noniso_string():
def test_to_datetime_errors_non_scalar_not_implemented(errors):
with pytest.raises(NotImplementedError):
cudf.to_datetime([1, ""], unit="s", errors=errors)


@pytest.mark.parametrize("freqstr", ["H", "N", "T", "L", "U", "S"])
def test_datetime_raise_warning(freqstr):
    """Passing a deprecated frequency alias to ``dt.ceil`` emits ``FutureWarning``."""
    timestamps = [
        "2001-01-01 00:04:45",
        "2001-01-01 00:04:58",
        "2001-01-01 00:05:04",
    ]
    ser = cudf.Series(timestamps, dtype="datetime64[ns]")
    with pytest.warns(FutureWarning):
        ser.dt.ceil(freqstr)
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2422,7 +2422,7 @@ def test_index_type_methods(data, func):


@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "s", "ms", "us", "ns"]
)
def test_index_datetime_ceil(resolution):
cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000])
Expand All @@ -2435,7 +2435,7 @@ def test_index_datetime_ceil(resolution):


@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "s", "ms", "us", "ns"]
)
def test_index_datetime_floor(resolution):
cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000])
Expand All @@ -2448,7 +2448,7 @@ def test_index_datetime_floor(resolution):


@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "s", "ms", "us", "ns"]
)
def test_index_datetime_round(resolution):
cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000])
Expand Down Expand Up @@ -2490,7 +2490,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
pd.Series(
range(25),
index=pd.date_range(
start="2019-01-01", end="2019-01-02", freq="H"
start="2019-01-01", end="2019-01-02", freq="h"
),
),
],
Expand Down
Loading

0 comments on commit cf32049

Please sign in to comment.