From ef50796b97f3fbe35e7613b405b2c6e2c59421ef Mon Sep 17 00:00:00 2001 From: Marlene <57748216+marlenezw@users.noreply.github.com> Date: Thu, 30 Sep 2021 18:54:08 +0200 Subject: [PATCH] Add `isocalendar` API support (#9169) Closes #8896 and #4908. Adds a new `isocalendar` method to the cuDF Python API and resolves related datetime issues with obtaining the year, week and day in ISO calendar form. Authors: - Marlene (https://github.com/marlenezw) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9169 --- .../cpp/strings/convert/convert_datetime.pxd | 3 +- python/cudf/cudf/_lib/string_casting.pyx | 11 ++- python/cudf/cudf/core/column/datetime.py | 60 ++++++++++++- python/cudf/cudf/core/index.py | 21 +++++ python/cudf/cudf/core/series.py | 59 +++++++++--- python/cudf/cudf/core/tools/datetimes.py | 25 +++++- python/cudf/cudf/tests/test_datetime.py | 89 +++++++++++++++---- 7 files changed, 229 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd index 5a9228608e5..5e7380c1d4e 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd @@ -17,7 +17,8 @@ cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ cdef unique_ptr[column] from_timestamps( column_view input_col, - string format) except + + string format, + column_view input_strings_names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 74490d6bb19..f9e98efbbd9 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -509,7 +509,8 @@ def from_booleans(Column input_col): 
def int2timestamp( Column input_col, - format): + str format, + Column names): """ Converting/Casting input date-time column to string column with specified format @@ -517,6 +518,9 @@ def int2timestamp( Parameters ---------- input_col : input column of type timestamp in integer format + format : The string specifying output format + names : The string names to use for weekdays ("%a", "%A") and + months ("%b", "%B") Returns ------- @@ -525,12 +529,15 @@ def int2timestamp( """ cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") + cdef column_view input_strings_names = names.view() + cdef unique_ptr[column] c_result with nogil: c_result = move( cpp_from_timestamps( input_column_view, - c_timestamp_format)) + c_timestamp_format, + input_strings_names)) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 0d4edbf0113..36d6d58625f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -4,7 +4,9 @@ import builtins import datetime as dt +import locale import re +from locale import nl_langinfo from numbers import Number from types import SimpleNamespace from typing import Any, Mapping, Sequence, Union, cast @@ -50,6 +52,56 @@ "datetime64[s]": "%Y-%m-%d %H:%M:%S", } +_DATETIME_SPECIAL_FORMATS = { + "%b", + "%B", + "%A", + "%a", +} + +_DATETIME_NAMES = [ + nl_langinfo(locale.AM_STR), # type: ignore + nl_langinfo(locale.PM_STR), # type: ignore + nl_langinfo(locale.DAY_1), + nl_langinfo(locale.DAY_2), + nl_langinfo(locale.DAY_3), + nl_langinfo(locale.DAY_4), + nl_langinfo(locale.DAY_5), + nl_langinfo(locale.DAY_6), + nl_langinfo(locale.DAY_7), + nl_langinfo(locale.ABDAY_1), + nl_langinfo(locale.ABDAY_2), + nl_langinfo(locale.ABDAY_3), + nl_langinfo(locale.ABDAY_4), + nl_langinfo(locale.ABDAY_5), + nl_langinfo(locale.ABDAY_6), + nl_langinfo(locale.ABDAY_7), + 
nl_langinfo(locale.MON_1), + nl_langinfo(locale.MON_2), + nl_langinfo(locale.MON_3), + nl_langinfo(locale.MON_4), + nl_langinfo(locale.MON_5), + nl_langinfo(locale.MON_6), + nl_langinfo(locale.MON_7), + nl_langinfo(locale.MON_8), + nl_langinfo(locale.MON_9), + nl_langinfo(locale.MON_10), + nl_langinfo(locale.MON_11), + nl_langinfo(locale.MON_12), + nl_langinfo(locale.ABMON_1), + nl_langinfo(locale.ABMON_2), + nl_langinfo(locale.ABMON_3), + nl_langinfo(locale.ABMON_4), + nl_langinfo(locale.ABMON_5), + nl_langinfo(locale.ABMON_6), + nl_langinfo(locale.ABMON_7), + nl_langinfo(locale.ABMON_8), + nl_langinfo(locale.ABMON_9), + nl_langinfo(locale.ABMON_10), + nl_langinfo(locale.ABMON_11), + nl_langinfo(locale.ABMON_12), +] + class DatetimeColumn(column.ColumnBase): """ @@ -278,10 +330,16 @@ def as_string_column( format = _dtype_to_format_conversion.get( self.dtype.name, "%Y-%m-%d %H:%M:%S" ) + if format in _DATETIME_SPECIAL_FORMATS: + names = as_column(_DATETIME_NAMES) + else: + names = cudf.core.column.column_empty( + 0, dtype="object", masked=False + ) if len(self) > 0: return string._datetime_to_str_typecast_functions[ cudf.dtype(self.dtype) - ](self, format) + ](self, format, names) else: return cast( "cudf.core.column.StringColumn", diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 574da689e79..bf12cb79e6a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1615,6 +1615,27 @@ def quarter(self): res = extract_quarter(self._values) return Int8Index(res, dtype="int8") + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day + calculated according to the ISO 8601 standard. + + Returns + ------- + DataFrame + with columns year, week and day + + Examples + -------- + >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", + ... 
"1999-12-31 18:40:00"]) + >>> gIndex.isocalendar() + year week day + 2020-05-31 08:00:00 2020 22 7 + 1999-12-31 18:40:00 1999 52 5 + """ + return cudf.core.tools.datetimes._to_iso_calendar(self) + def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9439de5b23b..18eec84ccf6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5214,6 +5214,49 @@ def quarter(self): {None: res}, index=self.series._index, name=self.series.name, ) + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day + calculated according to the ISO 8601 standard. + + Returns + ------- + DataFrame + with columns year, week and day + + Examples + -------- + >>> ser = cudf.Series(pd.date_range(start="2021-07-25", + ... end="2021-07-30")) + >>> ser.dt.isocalendar() + year week day + 0 2021 29 7 + 1 2021 30 1 + 2 2021 30 2 + 3 2021 30 3 + 4 2021 30 4 + 5 2021 30 5 + >>> ser.dt.isocalendar().week + 0 29 + 1 30 + 2 30 + 3 30 + 4 30 + 5 30 + Name: week, dtype: object + + >>> serIndex = cudf.to_datetime(pd.Series(["2010-01-01", pd.NaT])) + >>> serIndex.dt.isocalendar() + year week day + 0 2009 53 5 + 1 + >>> serIndex.dt.isocalendar().year + 0 2009 + 1 + Name: year, dtype: object + """ + return cudf.core.tools.datetimes._to_iso_calendar(self) + @property def is_month_start(self): """ @@ -5517,9 +5560,8 @@ def strftime(self, date_format, *args, **kwargs): Notes ----- - The following date format identifiers are not yet supported: ``%a``, - ``%A``, ``%w``, ``%b``, ``%B``, ``%U``, ``%W``, ``%c``, ``%x``, - ``%X``, ``%G``, ``%u``, ``%V`` + The following date format identifiers are not yet + supported: ``%c``, ``%x``, ``%X`` Examples -------- @@ -5558,19 +5600,9 @@ def strftime(self, date_format, *args, **kwargs): # once https://github.com/rapidsai/cudf/issues/5991 # is implemented 
not_implemented_formats = { - "%a", - "%A", - "%w", - "%b", - "%B", - "%U", - "%W", "%c", "%x", "%X", - "%G", - "%u", - "%V", } for d_format in not_implemented_formats: if d_format in date_format: @@ -5580,7 +5612,6 @@ def strftime(self, date_format, *args, **kwargs): f"https://github.com/rapidsai/cudf/issues/5991 " f"for tracking purposes." ) - str_col = self.series._column.as_string_column( dtype="str", format=date_format ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index b0fb2fb4274..e17c58d1db7 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,6 +4,7 @@ from typing import Sequence, Union import numpy as np +import pandas as pd from pandas.core.tools.datetimes import _unit_map import cudf @@ -221,8 +222,8 @@ def to_datetime( format=format, ) return as_index(col, name=arg.name) - elif isinstance(arg, cudf.Series): - col = arg._column + elif isinstance(arg, (cudf.Series, pd.Series)): + col = column.as_column(arg) col = _process_col( col=col, unit=unit, @@ -652,3 +653,23 @@ def _isin_datetimelike( res = lhs._obtain_isin_result(rhs) return res + + +def _to_iso_calendar(arg): + formats = ["%G", "%V", "%u"] + if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): + raise AttributeError( + "Can only use .isocalendar accessor with series or index" + ) + if isinstance(arg, cudf.Index): + iso_params = [ + arg._column.as_string_column(arg._values.dtype, fmt) + for fmt in formats + ] + index = arg._column + elif isinstance(arg.series, cudf.Series): + iso_params = [arg.strftime(fmt) for fmt in formats] + index = arg.series.index + + data = dict(zip(["year", "week", "day"], iso_params)) + return cudf.DataFrame(data, index=index, dtype=np.int32) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 6e5b3c39dc4..688109e3862 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ 
b/python/cudf/cudf/tests/test_datetime.py @@ -1131,7 +1131,26 @@ def test_datetime_fillna(data, dtype, fill_value): ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) @pytest.mark.parametrize( - "date_format", ["%d - %m", "%y/%H", "%Y", "%I - %M / %S", "%f", "%j", "%p"] + "date_format", + [ + "%d - %m", + "%y/%H", + "%Y", + "%I - %M / %S", + "%f", + "%j", + "%p", + "%w", + "%U", + "%W", + "%G", + "%u", + "%V", + "%b", + "%B", + "%a", + "%A", + ], ) def test_datetime_strftime(data, dtype, date_format): gsr = cudf.Series(data, dtype=dtype) @@ -1143,24 +1162,7 @@ def test_datetime_strftime(data, dtype, date_format): assert_eq(expected, actual) -@pytest.mark.parametrize( - "date_format", - [ - "%a", - "%A", - "%w", - "%b", - "%B", - "%U", - "%W", - "%c", - "%x", - "%X", - "%G", - "%u", - "%V", - ], -) +@pytest.mark.parametrize("date_format", ["%c", "%x", "%X"]) def test_datetime_strftime_not_implemented_formats(date_format): gsr = cudf.Series([1, 2, 3], dtype="datetime64[ms]") @@ -1334,6 +1336,55 @@ def test_quarter(): assert_eq(expect2.values, got2.values, check_dtype=False) +@pytest.mark.parametrize( + "data", + [ + pd.Series([], dtype="datetime64[ns]"), + pd.Series(pd.date_range("2010-01-01", "2010-02-01")), + pd.Series([None, None], dtype="datetime64[ns]"), + pd.Series("2020-05-31 08:00:00", dtype="datetime64[s]"), + pd.Series( + pd.date_range(start="2021-07-25", end="2021-07-30"), + index=["a", "b", "c", "d", "e", "f"], + ), + ], +) +def test_isocalendar_series(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.dt.isocalendar() + got = gs.dt.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + pd.DatetimeIndex([], dtype="datetime64[ns]"), + pd.DatetimeIndex([None, None], dtype="datetime64[ns]"), + pd.DatetimeIndex( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + ], + dtype="datetime64[ns]", + ), + pd.DatetimeIndex(["2100-03-14 07:30:00"], 
dtype="datetime64[ns]"), + ], +) +def test_isocalendar_index(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.isocalendar() + got = gs.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + @pytest.mark.parametrize("dtype", DATETIME_TYPES) def test_days_in_months(dtype): nrows = 1000