-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support freq
in DatetimeIndex
#14593
Changes from 18 commits
9602715
98e5e1e
6b0beee
20ca2bb
b461ecb
45fb6b2
f68a689
0337840
957c7c5
ed3ba3f
ce4f3bd
6ecc6e6
cd00345
6d00347
e1d2315
55266cd
e1b697f
32f622a
0d5c452
112dbc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -463,13 +463,19 @@ class DateOffset: | |
} | ||
|
||
_CODES_TO_UNITS = { | ||
"N": "nanoseconds", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have some vague recollection that we left these out on purpose... hmm. I think there was some pandas behavior for which There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There were a bunch of failing tests without these changes, adding these units passed the cudf pytests. There is only slight increase in pandas-pytest failures:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good, thanks for checking. |
||
"ns": "nanoseconds", | ||
"U": "microseconds", | ||
"us": "microseconds", | ||
"ms": "milliseconds", | ||
"L": "milliseconds", | ||
"s": "seconds", | ||
"S": "seconds", | ||
"m": "minutes", | ||
"min": "minutes", | ||
"T": "minutes", | ||
"h": "hours", | ||
"H": "hours", | ||
"D": "days", | ||
"W": "weeks", | ||
"M": "months", | ||
|
@@ -487,7 +493,7 @@ class DateOffset: | |
pd_offset.Nano: "nanoseconds", | ||
} | ||
|
||
_FREQSTR_REGEX = re.compile("([0-9]*)([a-zA-Z]+)") | ||
_FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)") | ||
|
||
def __init__(self, n=1, normalize=False, **kwds): | ||
if normalize: | ||
|
@@ -843,10 +849,6 @@ def date_range( | |
arr = cp.linspace(start=start, stop=end, num=periods) | ||
result = cudf.core.column.as_column(arr).astype("datetime64[ns]") | ||
return cudf.DatetimeIndex._from_data({name: result}) | ||
elif cudf.get_option("mode.pandas_compatible"): | ||
raise NotImplementedError( | ||
"`DatetimeIndex` with `freq` cannot be constructed." | ||
) | ||
|
||
# The code logic below assumes `freq` is defined. It is first normalized | ||
# into `DateOffset` for further computation with timestamps. | ||
|
@@ -940,7 +942,7 @@ def date_range( | |
arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") | ||
res = cudf.core.column.as_column(arr).astype("datetime64[ns]") | ||
|
||
return cudf.DatetimeIndex._from_data({name: res}) | ||
return cudf.DatetimeIndex._from_data({name: res}, freq=freq) | ||
|
||
|
||
def _has_fixed_frequency(freq: DateOffset) -> bool: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
import cudf | ||
import cudf.testing.dataset_generator as dataset_generator | ||
from cudf import DataFrame, Series | ||
from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 | ||
from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 | ||
from cudf.core.index import DatetimeIndex | ||
from cudf.testing._utils import ( | ||
DATETIME_TYPES, | ||
|
@@ -1571,6 +1571,44 @@ def test_date_range_start_end_freq(request, start, end, freq): | |
reason="https://github.com/rapidsai/cudf/issues/12133", | ||
) | ||
) | ||
request.applymarker( | ||
pytest.mark.xfail( | ||
condition=( | ||
isinstance(freq, dict) | ||
and freq.get("hours", None) == 10 | ||
and freq.get("days", None) == 57 | ||
and freq.get("nanoseconds", None) == 3 | ||
and ( | ||
( | ||
start == "1996-11-21 04:05:30" | ||
and end == "2000-02-13 08:41:06" | ||
) | ||
or ( | ||
start == "1970-01-01 00:00:00" | ||
and end == "2000-02-13 08:41:06" | ||
) | ||
or ( | ||
start == "1970-01-01 00:00:00" | ||
and end == "1996-11-21 04:05:30" | ||
) | ||
or ( | ||
start == "1831-05-08 15:23:21" | ||
and end == "2000-02-13 08:41:06" | ||
) | ||
or ( | ||
start == "1831-05-08 15:23:21" | ||
and end == "1996-11-21 04:05:30" | ||
) | ||
or ( | ||
start == "1831-05-08 15:23:21" | ||
and end == "1970-01-01 00:00:00" | ||
) | ||
) | ||
), | ||
reason="Nanosecond offsets being dropped by pandas, which is " | ||
"fixed in pandas-2.0+", | ||
) | ||
) | ||
if isinstance(freq, str): | ||
_gfreq = _pfreq = freq | ||
else: | ||
|
@@ -1586,7 +1624,29 @@ def test_date_range_start_end_freq(request, start, end, freq): | |
) | ||
|
||
|
||
def test_date_range_start_freq_periods(start, freq, periods): | ||
def test_date_range_start_freq_periods(request, start, freq, periods): | ||
request.applymarker( | ||
pytest.mark.xfail( | ||
condition=( | ||
isinstance(freq, dict) | ||
and freq.get("hours", None) == 10 | ||
and freq.get("days", None) == 57 | ||
and freq.get("nanoseconds", None) == 3 | ||
and periods in (10, 100) | ||
and ( | ||
start | ||
in { | ||
"2000-02-13 08:41:06", | ||
"1996-11-21 04:05:30", | ||
"1970-01-01 00:00:00", | ||
"1831-05-08 15:23:21", | ||
} | ||
) | ||
), | ||
reason="Nanosecond offsets being dropped by pandas, which is " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this better solved by fixing the condition on the parameter, which should be "pandas < 2.0"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wanted to do that but it happens only for a few parameter combinations and we currently xpass/xfail strictly. That's the reason for the current approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know we have two diverging approaches at the same place but I plan on dropping these in pandas-2.0 feature branch. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay. We can clean it up later. |
||
"fixed in pandas-2.0+", | ||
) | ||
) | ||
if isinstance(freq, str): | ||
_gfreq = _pfreq = freq | ||
else: | ||
|
@@ -1613,6 +1673,28 @@ def test_date_range_end_freq_periods(request, end, freq, periods): | |
reason="https://github.com/pandas-dev/pandas/issues/46877", | ||
) | ||
) | ||
request.applymarker( | ||
pytest.mark.xfail( | ||
condition=( | ||
isinstance(freq, dict) | ||
and freq.get("hours", None) == 10 | ||
and freq.get("days", None) == 57 | ||
and freq.get("nanoseconds", None) == 3 | ||
and periods in (10, 100) | ||
and ( | ||
end | ||
in { | ||
"2000-02-13 08:41:06", | ||
"1996-11-21 04:05:30", | ||
"1970-01-01 00:00:00", | ||
"1831-05-08 15:23:21", | ||
} | ||
) | ||
), | ||
reason="Nanosecond offsets being dropped by pandas, which is " | ||
"fixed in pandas-2.0+", | ||
) | ||
) | ||
if isinstance(freq, str): | ||
_gfreq = _pfreq = freq | ||
else: | ||
|
@@ -2163,8 +2245,6 @@ def test_datetime_getitem_na(): | |
|
||
def test_daterange_pandas_compatibility(): | ||
with cudf.option_context("mode.pandas_compatible", True): | ||
with pytest.raises(NotImplementedError): | ||
cudf.date_range("20010101", "20020215", freq="400h", name="times") | ||
expected = pd.date_range( | ||
"2010-01-01", "2010-02-01", periods=10, name="times" | ||
) | ||
|
@@ -2174,6 +2254,46 @@ def test_daterange_pandas_compatibility(): | |
assert_eq(expected, actual) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"data,dtype,freq", | ||
[ | ||
([10], "datetime64[ns]", "2N"), | ||
([10, 12, 14, 16], "datetime64[ns]", "2N"), | ||
([10, 11, 12, 13], "datetime64[ns]", "1N"), | ||
([100, 200, 300, 400], "datetime64[s]", "100s"), | ||
([101, 201, 301, 401], "datetime64[ms]", "100ms"), | ||
], | ||
) | ||
def test_datetime_index_with_freq(request, data, dtype, freq): | ||
request.applymarker( | ||
pytest.mark.xfail( | ||
condition=(not PANDAS_GE_200 and dtype != "datetime64[ns]"), | ||
reason="Pandas < 2.0 lacks non-nano-second dtype support.", | ||
) | ||
) | ||
actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq) | ||
expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq) | ||
assert_eq(actual, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"data,dtype,freq", | ||
[ | ||
([10, 1232, 13244, 13426], "datetime64[ns]", "2N"), | ||
([10, 11, 12, 13], "datetime64[ns]", "1s"), | ||
([10000, 200, 300, 400], "datetime64[s]", "100s"), | ||
([107871, 201, 301, 401], "datetime64[ms]", "100ns"), | ||
], | ||
) | ||
def test_datetime_index_freq_error(data, dtype, freq): | ||
assert_exceptions_equal( | ||
pd.DatetimeIndex, | ||
cudf.DatetimeIndex, | ||
([data], {"dtype": dtype, "freq": freq}), | ||
([data], {"dtype": dtype, "freq": freq}), | ||
) | ||
|
||
|
||
def test_strings_with_utc_offset_not_implemented(): | ||
with pytest.warns(DeprecationWarning, match="parsing timezone"): # cupy | ||
with pytest.raises(NotImplementedError): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While looking on adding
freq
support before, I found some APIs manipulatefreq
(to new values) and return new results. (I vaguely remember..but I think that happens in binops?) Should we add a TODO comment here that this is not fully functional yet andfreq
support needs to be added in rest of the code-base?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, although maybe the default behaviour could be for
DatetimeIndex
to inferfreq
from its values. Then this should just work.Also, we should probably only do that in compatibility mode for perf reasons.