Skip to content

Commit

Permalink
Add isocalendar API support (#9169)
Browse files Browse the repository at this point in the history
Closes #8896 #4908.

Adds a new `isocalendar` method to cuDF Python and resolves related datetime issues around obtaining the ISO 8601 year, week, and day.

Authors:
  - Marlene  (https://github.com/marlenezw)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9169
  • Loading branch information
marlenezw authored Sep 30, 2021
1 parent 840faf5 commit ef50796
Show file tree
Hide file tree
Showing 7 changed files with 229 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \

cdef unique_ptr[column] from_timestamps(
column_view input_col,
string format) except +
string format,
column_view input_strings_names) except +

cdef unique_ptr[column] is_timestamp(
column_view input_col,
Expand Down
11 changes: 9 additions & 2 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -509,14 +509,18 @@ def from_booleans(Column input_col):

def int2timestamp(
Column input_col,
format):
str format,
Column names):
"""
Converting/Casting input date-time column to string
column with specified format
Parameters
----------
input_col : input column of type timestamp in integer format
format : The string specifying output format
names : The string names to use for weekdays ("%a", "%A") and
months ("%b", "%B")
Returns
-------
Expand All @@ -525,12 +529,15 @@ def int2timestamp(
"""
cdef column_view input_column_view = input_col.view()
cdef string c_timestamp_format = format.encode("UTF-8")
cdef column_view input_strings_names = names.view()

cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_timestamps(
input_column_view,
c_timestamp_format))
c_timestamp_format,
input_strings_names))

return Column.from_unique_ptr(move(c_result))

Expand Down
60 changes: 59 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

import builtins
import datetime as dt
import locale
import re
from locale import nl_langinfo
from numbers import Number
from types import SimpleNamespace
from typing import Any, Mapping, Sequence, Union, cast
Expand Down Expand Up @@ -50,6 +52,56 @@
"datetime64[s]": "%Y-%m-%d %H:%M:%S",
}

# strftime directives that render weekday / month *names*; when one of these
# is requested, the datetime-to-string conversion is handed a column of
# names instead of an empty one.
_DATETIME_SPECIAL_FORMATS = {"%a", "%A", "%b", "%B"}

# Name table used when formatting datetimes as strings. The entries are read
# once, at import time, from whichever locale is active at that moment.
# Layout: AM/PM markers, full weekday names (DAY_1..DAY_7), abbreviated
# weekday names, full month names (MON_1..MON_12), abbreviated month names.
# NOTE(review): the order presumably has to match what the string-conversion
# routine consuming this list expects - confirm before reordering.
_DATETIME_NAMES = [
    nl_langinfo(getattr(locale, langinfo_item))
    for langinfo_item in (
        "AM_STR",
        "PM_STR",
        *(f"DAY_{day}" for day in range(1, 8)),
        *(f"ABDAY_{day}" for day in range(1, 8)),
        *(f"MON_{month}" for month in range(1, 13)),
        *(f"ABMON_{month}" for month in range(1, 13)),
    )
]


class DatetimeColumn(column.ColumnBase):
"""
Expand Down Expand Up @@ -278,10 +330,16 @@ def as_string_column(
format = _dtype_to_format_conversion.get(
self.dtype.name, "%Y-%m-%d %H:%M:%S"
)
if format in _DATETIME_SPECIAL_FORMATS:
names = as_column(_DATETIME_NAMES)
else:
names = cudf.core.column.column_empty(
0, dtype="object", masked=False
)
if len(self) > 0:
return string._datetime_to_str_typecast_functions[
cudf.dtype(self.dtype)
](self, format)
](self, format, names)
else:
return cast(
"cudf.core.column.StringColumn",
Expand Down
21 changes: 21 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,27 @@ def quarter(self):
res = extract_quarter(self._values)
return Int8Index(res, dtype="int8")

def isocalendar(self):
    """
    Return the ISO 8601 year, week and day of this index as a DataFrame.

    Returns
    -------
    DataFrame
        With columns ``year``, ``week`` and ``day``.

    Examples
    --------
    >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00",
    ...    "1999-12-31 18:40:00"])
    >>> gIndex.isocalendar()
                         year  week  day
    2020-05-31 08:00:00  2020    22    7
    1999-12-31 18:40:00  1999    52    5
    """
    # The shared helper handles both Index and Series ``.dt`` inputs.
    iso_frame = cudf.core.tools.datetimes._to_iso_calendar(self)
    return iso_frame

def to_pandas(self):
nanos = self._values.astype("datetime64[ns]")
return pd.DatetimeIndex(nanos.to_pandas(), name=self.name)
Expand Down
59 changes: 45 additions & 14 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5214,6 +5214,49 @@ def quarter(self):
{None: res}, index=self.series._index, name=self.series.name,
)

def isocalendar(self):
    """
    Return the ISO 8601 year, week and day of the series as a DataFrame.

    Returns
    -------
    DataFrame
        With columns ``year``, ``week`` and ``day``.

    Examples
    --------
    >>> ser = cudf.Series(pd.date_range(start="2021-07-25",
    ...    end="2021-07-30"))
    >>> ser.dt.isocalendar()
       year  week  day
    0  2021    29    7
    1  2021    30    1
    2  2021    30    2
    3  2021    30    3
    4  2021    30    4
    5  2021    30    5
    >>> ser.dt.isocalendar().week
    0    29
    1    30
    2    30
    3    30
    4    30
    5    30
    Name: week, dtype: object

    >>> serIndex = cudf.to_datetime(pd.Series(["2010-01-01", pd.NaT]))
    >>> serIndex.dt.isocalendar()
       year  week   day
    0  2009    53     5
    1  <NA>  <NA>  <NA>
    >>> serIndex.dt.isocalendar().year
    0    2009
    1    <NA>
    Name: year, dtype: object
    """
    # The shared helper handles both Index and Series ``.dt`` inputs.
    iso_frame = cudf.core.tools.datetimes._to_iso_calendar(self)
    return iso_frame

@property
def is_month_start(self):
"""
Expand Down Expand Up @@ -5517,9 +5560,8 @@ def strftime(self, date_format, *args, **kwargs):
Notes
-----
The following date format identifiers are not yet supported: ``%a``,
``%A``, ``%w``, ``%b``, ``%B``, ``%U``, ``%W``, ``%c``, ``%x``,
``%X``, ``%G``, ``%u``, ``%V``
The following date format identifiers are not yet
supported: ``%c``, ``%x``,``%X``
Examples
--------
Expand Down Expand Up @@ -5558,19 +5600,9 @@ def strftime(self, date_format, *args, **kwargs):
# once https://github.com/rapidsai/cudf/issues/5991
# is implemented
not_implemented_formats = {
"%a",
"%A",
"%w",
"%b",
"%B",
"%U",
"%W",
"%c",
"%x",
"%X",
"%G",
"%u",
"%V",
}
for d_format in not_implemented_formats:
if d_format in date_format:
Expand All @@ -5580,7 +5612,6 @@ def strftime(self, date_format, *args, **kwargs):
f"https://github.com/rapidsai/cudf/issues/5991 "
f"for tracking purposes."
)

str_col = self.series._column.as_string_column(
dtype="str", format=date_format
)
Expand Down
25 changes: 23 additions & 2 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Sequence, Union

import numpy as np
import pandas as pd
from pandas.core.tools.datetimes import _unit_map

import cudf
Expand Down Expand Up @@ -221,8 +222,8 @@ def to_datetime(
format=format,
)
return as_index(col, name=arg.name)
elif isinstance(arg, cudf.Series):
col = arg._column
elif isinstance(arg, (cudf.Series, pd.Series)):
col = column.as_column(arg)
col = _process_col(
col=col,
unit=unit,
Expand Down Expand Up @@ -652,3 +653,23 @@ def _isin_datetimelike(

res = lhs._obtain_isin_result(rhs)
return res


def _to_iso_calendar(arg):
    """
    Build the ``isocalendar`` result for a datetime index or ``.dt`` accessor.

    Each of the strftime directives ``%G``, ``%V`` and ``%u`` is rendered
    to a string column, and the three results are assembled into a
    DataFrame with columns ``year``, ``week`` and ``day`` constructed with
    ``dtype=np.int32``.

    Parameters
    ----------
    arg : cudf.Index or cudf.core.series.DatetimeProperties
        The index, or the ``.dt`` accessor of a series, to convert.

    Returns
    -------
    DataFrame
        Indexed by the index's column (Index input) or by the wrapped
        series' index (accessor input).

    Raises
    ------
    AttributeError
        If ``arg`` is neither an Index nor a DatetimeProperties accessor.
    """
    accepted_types = (cudf.Index, cudf.core.series.DatetimeProperties)
    if not isinstance(arg, accepted_types):
        raise AttributeError(
            "Can only use .isocalendar accessor with series or index"
        )

    iso_directives = ["%G", "%V", "%u"]
    if isinstance(arg, cudf.Index):
        # An Index has no strftime here; format its column directly.
        source_column = arg._column
        string_columns = [
            source_column.as_string_column(arg._values.dtype, directive)
            for directive in iso_directives
        ]
        frame_index = source_column
    elif isinstance(arg.series, cudf.Series):
        string_columns = [
            arg.strftime(directive) for directive in iso_directives
        ]
        frame_index = arg.series.index

    labelled_columns = {
        label: values
        for label, values in zip(["year", "week", "day"], string_columns)
    }
    return cudf.DataFrame(
        labelled_columns, index=frame_index, dtype=np.int32
    )
Loading

0 comments on commit ef50796

Please sign in to comment.