Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate certain frequency strings #14967

Merged
merged 9 commits into from
Feb 6, 2024
25 changes: 21 additions & 4 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import warnings

from cudf.core.buffer import acquire_spill_lock

Expand Down Expand Up @@ -85,19 +87,34 @@ cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq):
cdef libcudf_datetime.rounding_frequency freq_val

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html
old_to_new_freq_map = {
"H": "h",
"N": "ns",
"T": "min",
"L": "ms",
"U": "us",
"S": "s",
}
if freq in old_to_new_freq_map:
warnings.warn(
f"FutureWarning: {freq} is deprecated and will be "
"removed in a future version, please use "
f"{old_to_new_freq_map[freq]} instead.",
FutureWarning
)
if freq == "D":
freq_val = libcudf_datetime.rounding_frequency.DAY
elif freq == "H":
elif freq in ("H", "h"):
freq_val = libcudf_datetime.rounding_frequency.HOUR
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we do `freq = old_to_new_freq_map.get(freq, freq)` and then update these `elif` checks to compare against only the new frequency strings?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, updated the code 👍

elif freq in ("T", "min"):
freq_val = libcudf_datetime.rounding_frequency.MINUTE
elif freq == "S":
elif freq in ("S", "s"):
freq_val = libcudf_datetime.rounding_frequency.SECOND
elif freq in ("L", "ms"):
freq_val = libcudf_datetime.rounding_frequency.MILLISECOND
elif freq in ("U", "us"):
freq_val = libcudf_datetime.rounding_frequency.MICROSECOND
elif freq == "N":
elif freq in ("N", "ns"):
freq_val = libcudf_datetime.rounding_frequency.NANOSECOND
else:
raise ValueError(f"Invalid resolution: '{freq}'")
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/core/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@
PANDAS_GE_214 = PANDAS_VERSION >= version.parse("2.1.4")
PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0")
PANDAS_LT_203 = PANDAS_VERSION < version.parse("2.0.3")
PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0")
PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0")
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
import pandas as pd

import cudf
Expand All @@ -17,7 +17,7 @@ def test_tz_localize():


def test_tz_convert():
pidx = pd.date_range("2023-01-01", periods=3, freq="H")
pidx = pd.date_range("2023-01-01", periods=3, freq="h")
idx = cudf.from_pandas(pidx)
pidx = pidx.tz_localize("UTC")
idx = idx.tz_localize("UTC")
Expand All @@ -27,6 +27,6 @@ def test_tz_convert():


def test_delocalize_naive():
pidx = pd.date_range("2023-01-01", periods=3, freq="H")
pidx = pd.date_range("2023-01-01", periods=3, freq="h")
idx = cudf.from_pandas(pidx)
assert_eq(pidx.tz_localize(None), idx.tz_localize(None))
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/series/test_datetimelike.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

import os

Expand Down Expand Up @@ -130,7 +130,7 @@ def test_delocalize_naive():
"to_tz", ["Europe/London", "America/Chicago", "UTC", None]
)
def test_convert(from_tz, to_tz):
ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="H"))
ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="h"))
gs = cudf.from_pandas(ps)
ps = ps.dt.tz_localize(from_tz)
gs = gs.dt.tz_localize(from_tz)
Expand All @@ -140,7 +140,7 @@ def test_convert(from_tz, to_tz):


def test_convert_from_naive():
gs = cudf.Series(cudf.date_range("2023-01-01", periods=3, freq="H"))
gs = cudf.Series(cudf.date_range("2023-01-01", periods=3, freq="h"))
with pytest.raises(TypeError):
gs.dt.tz_convert("America/New_York")

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_dataset_timeseries():
gdf = cudf.datasets.timeseries(
"2000",
"2010",
freq="2H",
freq="2h",
dtypes={"value": float, "name": "category", "id": int},
nulls_frequency=0.7,
seed=1,
Expand Down
120 changes: 67 additions & 53 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,12 @@
import cudf
import cudf.testing.dataset_generator as dataset_generator
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_EQ_200, PANDAS_GE_200, PANDAS_GE_210
from cudf.core._compat import (
PANDAS_EQ_200,
PANDAS_GE_200,
PANDAS_GE_210,
PANDAS_GE_220,
)
from cudf.core.index import DatetimeIndex
from cudf.testing._utils import (
DATETIME_TYPES,
Expand All @@ -39,7 +44,7 @@ def data1():

def data2():
return pd.date_range(
"20010101", freq="243434324423423234N", name="times", periods=10
"20010101", freq="243434324423423234ns", name="times", periods=10
)


Expand Down Expand Up @@ -1497,10 +1502,10 @@ def test_is_month_start(data, dtype):
{"hours": 10, "days": 57, "nanoseconds": 3},
"83D",
"17h",
"-680T",
"-680min",
"110546s",
"110546789L",
"110546789248U",
"110546789ms",
"110546789248us",
]


Expand Down Expand Up @@ -1540,7 +1545,7 @@ def test_date_range_start_end_freq(request, start, end, freq):
condition=(
start == "1831-05-08 15:23:21"
and end == "1996-11-21 04:05:30"
and freq == "110546789L"
and freq == "110546789ms"
),
reason="https://github.com/rapidsai/cudf/issues/12133",
)
Expand Down Expand Up @@ -1653,7 +1658,8 @@ def test_date_range_end_freq_periods(request, end, freq, periods):
request.applymarker(
pytest.mark.xfail(
condition=(
isinstance(freq, dict)
not PANDAS_GE_220
and isinstance(freq, dict)
and freq.get("hours", None) == 10
and freq.get("days", None) == 57
and freq.get("nanoseconds", None) == 3
Expand Down Expand Up @@ -1723,30 +1729,34 @@ def test_date_range_raise_overflow():
@pytest.mark.parametrize(
"freqstr_unsupported",
[
"1M",
"2SM",
"1ME",
"2SME",
"3MS",
"4BM",
"5CBM",
"4BME",
"5CBME",
"6SMS",
"7BMS",
"8CBMS",
"Q",
"2BQ",
"QE",
"2BQE",
"3BQS",
"10A",
"10Y",
"9BA",
"9BY",
"8AS",
"10YE",
"9BYE",
"8YS",
"7BAS",
"7BYS",
"BH",
"bh",
"B",
],
)
def test_date_range_raise_unsupported(freqstr_unsupported):
def test_date_range_raise_unsupported(request, freqstr_unsupported):
request.applymarker(
pytest.mark.xfail(
condition=(
not PANDAS_GE_220 and freqstr_unsupported.endswith("E")
),
reason="TODO: Remove this once pandas-2.2 support is added",
)
)
s, e = "2001-01-01", "2008-01-31"
pd.date_range(start=s, end=e, freq=freqstr_unsupported)
with pytest.raises(ValueError, match="does not yet support"):
Expand All @@ -1757,9 +1767,9 @@ def test_date_range_raise_unsupported(freqstr_unsupported):
# is a valid frequency for every 3 milliseconds.
if freqstr_unsupported != "3MS":
freqstr_unsupported = freqstr_unsupported.lower()
pd.date_range(start=s, end=e, freq=freqstr_unsupported)
with pytest.raises(ValueError, match="does not yet support"):
cudf.date_range(start=s, end=e, freq=freqstr_unsupported)
with expect_warning_if(PANDAS_GE_220):
cudf.date_range(start=s, end=e, freq=freqstr_unsupported)


##################################################################
Expand Down Expand Up @@ -1957,7 +1967,7 @@ def test_error_values():
)
@pytest.mark.parametrize("time_type", DATETIME_TYPES)
@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
)
def test_ceil(request, data, time_type, resolution):
alias_map = {"L": "ms", "U": "us", "N": "ns"}
Expand Down Expand Up @@ -2002,7 +2012,7 @@ def test_ceil(request, data, time_type, resolution):
)
@pytest.mark.parametrize("time_type", DATETIME_TYPES)
@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
)
def test_floor(request, data, time_type, resolution):
alias_map = {"L": "ms", "U": "us", "N": "ns"}
Expand Down Expand Up @@ -2048,25 +2058,9 @@ def test_floor(request, data, time_type, resolution):
)
@pytest.mark.parametrize("time_type", DATETIME_TYPES)
@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
)
def test_round(request, data, time_type, resolution):
alias_map = {"L": "ms", "U": "us", "N": "ns"}
request.applymarker(
pytest.mark.xfail(
condition=(
PANDAS_EQ_200
and resolution in {"L", "ms", "U", "us", "N"}
and np.dtype(
f"datetime64[{alias_map.get(resolution, resolution)}]"
)
> np.dtype(time_type)
),
reason="https://github.com/pandas-dev/pandas/issues/52761",
strict=True,
)
)

def test_round(data, time_type, resolution):
gs = cudf.Series(data, dtype=time_type)
ps = gs.to_pandas()

Expand Down Expand Up @@ -2284,20 +2278,20 @@ def test_daterange_pandas_compatibility():
@pytest.mark.parametrize(
"data,dtype,freq",
[
([10], "datetime64[ns]", "2N"),
([10, 12, 14, 16], "datetime64[ns]", "2N"),
([10, 11, 12, 13], "datetime64[ns]", "1N"),
([10], "datetime64[ns]", "2ns"),
([10, 12, 14, 16], "datetime64[ns]", "2ns"),
([10, 11, 12, 13], "datetime64[ns]", "1ns"),
([100, 200, 300, 400], "datetime64[s]", "100s"),
([101, 201, 301, 401], "datetime64[ms]", "100ms"),
],
)
def test_datetime_index_with_freq(request, data, dtype, freq):
request.applymarker(
pytest.mark.xfail(
condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"),
reason="Pandas < 2.0 lacks non-nano-second dtype support.",
)
)
# request.applymarker(
# pytest.mark.xfail(
# condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"),
# reason="Pandas < 2.0 lacks non-nano-second dtype support.",
# )
# )
actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq)
expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq)
assert_eq(actual, expected)
Expand All @@ -2306,7 +2300,7 @@ def test_datetime_index_with_freq(request, data, dtype, freq):
@pytest.mark.parametrize(
"data,dtype,freq",
[
([10, 1232, 13244, 13426], "datetime64[ns]", "2N"),
([10, 1232, 13244, 13426], "datetime64[ns]", "2ns"),
([10, 11, 12, 13], "datetime64[ns]", "1s"),
([10000, 200, 300, 400], "datetime64[s]", "100s"),
([107871, 201, 301, 401], "datetime64[ms]", "100ns"),
Expand Down Expand Up @@ -2454,3 +2448,23 @@ def test_dateimeindex_from_noniso_string():
def test_to_datetime_errors_non_scalar_not_implemented(errors):
with pytest.raises(NotImplementedError):
cudf.to_datetime([1, ""], unit="s", errors=errors)


@pytest.mark.parametrize(
"freqstr",
[
"H",
"N",
"T",
"L",
"U",
"S",
],
)
def test_datetime_raise_warning(freqstr):
t = cudf.Series(
["2001-01-01 00:04:45", "2001-01-01 00:04:58", "2001-01-01 00:05:04"],
dtype="datetime64[ns]",
)
with pytest.warns(FutureWarning):
t.dt.ceil(freqstr)
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2422,7 +2422,7 @@ def test_index_type_methods(data, func):


@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "s", "ms", "us", "ns"]
)
def test_index_datetime_ceil(resolution):
cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000])
Expand All @@ -2435,7 +2435,7 @@ def test_index_datetime_ceil(resolution):


@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "s", "ms", "us", "ns"]
)
def test_index_datetime_floor(resolution):
cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000])
Expand All @@ -2448,7 +2448,7 @@ def test_index_datetime_floor(resolution):


@pytest.mark.parametrize(
"resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]
"resolution", ["D", "h", "min", "s", "ms", "us", "ns"]
)
def test_index_datetime_round(resolution):
cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000])
Expand Down Expand Up @@ -2490,7 +2490,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
pd.Series(
range(25),
index=pd.date_range(
start="2019-01-01", end="2019-01-02", freq="H"
start="2019-01-01", end="2019-01-02", freq="h"
),
),
],
Expand Down
7 changes: 6 additions & 1 deletion python/cudf/cudf/tests/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_220
from cudf.testing._utils import assert_eq


Expand Down Expand Up @@ -166,13 +167,17 @@ def test_interval_index_unique():
assert_eq(expected, actual)


@pytest.mark.xfail(
condition=not PANDAS_GE_220,
reason="TODO: Remove this once pandas-2.2 support is added",
)
@pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex])
@pytest.mark.parametrize("tz", ["US/Eastern", None])
def test_interval_with_datetime(tz, box):
dti = pd.date_range(
start=pd.Timestamp("20180101", tz=tz),
end=pd.Timestamp("20181231", tz=tz),
freq="M",
freq="ME",
)
pobj = box(pd.IntervalIndex.from_breaks(dti))
if tz is None:
Expand Down
Loading
Loading