Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose days_in_month function in libcudf and add python bindings #8892

Merged
merged 12 commits into from
Aug 6, 2021
15 changes: 15 additions & 0 deletions cpp/include/cudf/datetime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,21 @@ std::unique_ptr<cudf::column> is_leap_year(
cudf::column_view const& column,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Extract the number of days in the month
 *
 * output[i] contains the number of days in the month of date `column[i]`
 * output[i] is null if `column[i]` is null
 *
 * @throw cudf::logic_error if input column datatype is not a TIMESTAMP
 *
 * @param column cudf::column_view of the input datetime values
 * @param mr Device memory resource passed through to the implementation;
 *           defaults to the current device resource
 * @return cudf::column of datatype INT16 of days in month of the corresponding date
 */
std::unique_ptr<cudf::column> days_in_month(
cudf::column_view const& column,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns the quarter of the date
*
Expand Down
73 changes: 33 additions & 40 deletions cpp/src/datetime/datetime_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,31 +83,30 @@ static __device__ int16_t const days_until_month[2][13] = {
{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} // For leap years
};

// Number of days in month `mon`, computed as the difference between two
// consecutive entries of the cumulative `days_until_month` table. The row is
// selected by `is_leap_year` (false -> row 0, true -> row 1).
CUDA_DEVICE_CALLABLE uint8_t days_in_month(cuda::std::chrono::month mon, bool is_leap_year)
{
  auto const& cumulative_days = days_until_month[is_leap_year];
  auto const month_index      = unsigned{mon};
  return cumulative_days[month_index] - cumulative_days[month_index - 1];
}

// Round up the date to the last day of the month and return the
// date only (without the time component)
struct extract_last_day_of_month {
  template <typename Timestamp>
  CUDA_DEVICE_CALLABLE timestamp_D operator()(Timestamp const ts) const
  {
    using namespace cuda::std::chrono;
    // floor<days> keeps only the date part - the time component is chopped
    // off, which is what we want
    year_month_day const ymd(floor<days>(ts));
    // Same year and month, with the day replaced by the `last` specifier
    auto const ymdl = year_month_day_last{ymd.year() / ymd.month() / last};
    return timestamp_D{sys_days{ymdl}};
  }
};
// Extract the number of days of the month
// Mirrors `extract_last_day_of_month`, but yields the day number of the
// month's last day as an integer instead of a timestamp.
struct days_in_month_op {
  template <typename Timestamp>
  CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const
  {
    using namespace cuda::std::chrono;
    // Drop the time-of-day part, then split into year / month / day
    year_month_day const ymd{floor<days>(ts)};
    // Same year and month, with the day replaced by the `last` specifier
    year_month_day_last const month_end{ymd.year() / ymd.month() / last};
    auto const day_count = static_cast<unsigned>(month_end.day());
    return static_cast<int16_t>(day_count);
  }
};

Expand Down Expand Up @@ -144,6 +143,7 @@ struct extract_quarter_op {
}
};

// Returns true if the year is a leap year
struct is_leap_year_op {
template <typename Timestamp>
CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const
Expand Down Expand Up @@ -220,22 +220,6 @@ struct add_calendrical_months_functor {
{
}

// std chrono implementation is copied here due to nvcc bug 2909685
// https://howardhinnant.github.io/date_algorithms.html#days_from_civil
//
// Converts a year/month/day triple into a `timestamp_D` (day count).
static CUDA_DEVICE_CALLABLE timestamp_D
compute_sys_days(cuda::std::chrono::year_month_day const& ymd)
{
  const unsigned month_num = static_cast<unsigned>(ymd.month());
  const unsigned day_num   = static_cast<unsigned>(ymd.day());
  // January and February are counted as part of the preceding year
  const int shifted_year =
    static_cast<int>(ymd.year()) - (ymd.month() <= cuda::std::chrono::month{2});

  const int era = (shifted_year >= 0 ? shifted_year : shifted_year - 399) / 400;
  const unsigned year_of_era  = static_cast<unsigned>(shifted_year - era * 400);  // [0, 399]
  const unsigned month_offset = month_num + (month_num > 2 ? -3 : 9);
  const unsigned day_of_year  = (153 * month_offset + 2) / 5 + day_num - 1;  // [0, 365]
  const unsigned day_of_era =
    year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;  // [0, 146096]
  return timestamp_D{duration_D{era * 146097 + static_cast<int>(day_of_era) - 719468}};
}

template <typename Element>
typename std::enable_if_t<!cudf::is_timestamp_t<Element>::value, void> operator()(
rmm::cuda_stream_view stream) const
Expand Down Expand Up @@ -265,15 +249,10 @@ struct add_calendrical_months_functor {

// If the new date isn't valid, scale it back to the last day of the
// month.
// IDEAL: if (!ymd.ok()) ymd = ymd.year()/ymd.month()/last;
auto month_days = days_in_month(ymd.month(), ymd.year().is_leap());
if (unsigned{ymd.day()} > month_days)
ymd = ymd.year() / ymd.month() / day{month_days};
if (!ymd.ok()) ymd = ymd.year() / ymd.month() / last;

// Put back the time component to the date
return
// IDEAL: sys_days{ymd} + ...
compute_sys_days(ymd) + (time_val - days_since_epoch);
return sys_days{ymd} + (time_val - days_since_epoch);
});
}
};
Expand Down Expand Up @@ -393,6 +372,13 @@ std::unique_ptr<column> is_leap_year(column_view const& column,
return apply_datetime_op<is_leap_year_op, type_id::BOOL8>(column, stream, mr);
}

// Stream-aware implementation: applies `days_in_month_op` through
// `apply_datetime_op`, requesting an INT16 result.
std::unique_ptr<column> days_in_month(column_view const& column,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
{
  constexpr auto output_type = type_id::INT16;
  return apply_datetime_op<days_in_month_op, output_type>(column, stream, mr);
}

std::unique_ptr<column> extract_quarter(column_view const& column,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
Expand Down Expand Up @@ -476,6 +462,13 @@ std::unique_ptr<column> is_leap_year(column_view const& column, rmm::mr::device_
return detail::is_leap_year(column, rmm::cuda_stream_default, mr);
}

// Public entry point: forwards to the detail implementation on the default
// CUDA stream.
std::unique_ptr<column> days_in_month(column_view const& column,
                                      rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::days_in_month(column, stream, mr);
}

std::unique_ptr<column> extract_quarter(column_view const& column,
rmm::mr::device_memory_resource* mr)
{
Expand Down
35 changes: 35 additions & 0 deletions cpp/tests/datetime/datetime_ops_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,41 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear)
{true, false, true, true, true, true, true, true, false, true, true, false}});
}

// Checks days_in_month on second-resolution timestamps spread over many
// months, including a leap-year and a non-leap-year February and two null rows.
TEST_F(BasicDatetimeOpsTest, TestDaysInMonths)
{
  using namespace cudf::test;
  using namespace cudf::datetime;
  using namespace cuda::std::chrono;

  // Rows 0 and 2 are marked null; their stored value (0) is a placeholder.
  auto input =
    cudf::test::fixed_width_column_wrapper<cudf::timestamp_s, cudf::timestamp_s::rep>{
      {
        0L,            // NULL
        -1887541682L,  // 1910-03-10 10:51:58
        0L,            // NULL
        -1251006943L,  // 1930-05-11 18:04:17
        -932134638L,   // 1940-06-18 09:42:42
        -614354877L,   // 1950-07-14 09:52:03
        -296070394L,   // 1960-08-14 06:13:26
        22840404L,     // 1970-09-22 08:33:24
        339817190L,    // 1980-10-08 01:39:50
        657928062L,    // 1990-11-06 21:47:42
        976630837L,    // 2000-12-12 14:20:37
        1294699018L,   // 2011-01-10 22:36:58
        1613970182L,   // 2021-02-22 05:03:02 - non leap year February
        1930963331L,   // 2031-03-11 02:42:11
        2249867102L,   // 2041-04-18 03:05:02
        951426858L,    // 2000-02-24 21:14:18 - leap year February
      },
      iterators::nulls_at({0, 2})};

  // One expected day count per input row; -1 is a placeholder at the null rows.
  auto expected = cudf::test::fixed_width_column_wrapper<int16_t>{
    {-1, 31, -1, 31, 30, 31, 31, 30, 31, 30, 31, 31, 28, 31, 30, 29},
    iterators::nulls_at({0, 2})};

  auto const result = days_in_month(input);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected);
}

TEST_F(BasicDatetimeOpsTest, TestQuarter)
{
using namespace cudf::test;
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/cpp/datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
) except +
cdef unique_ptr[column] day_of_year(const column_view& column) except +
cdef unique_ptr[column] is_leap_year(const column_view& column) except +
cdef unique_ptr[column] days_in_month(const column_view& column) except +
14 changes: 14 additions & 0 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,24 @@ def extract_datetime_component(Column col, object field):


def is_leap_year(Column col):
    """Return a boolean column flagging, per row, whether the year of the
    date is a leap year.
    """
    cdef column_view input_view = col.view()
    cdef unique_ptr[column] result_ptr

    # The libcudf call runs with the GIL released.
    with nogil:
        result_ptr = move(libcudf_datetime.is_leap_year(input_view))

    return Column.from_unique_ptr(move(result_ptr))


def days_in_month(Column col):
    """Extracts the number of days in the month of the date.

    Parameters
    ----------
    col : Column
        Input column whose view is passed to libcudf's ``days_in_month``.

    Returns
    -------
    Column
        Column built from the libcudf result.
    """
    cdef unique_ptr[column] c_result
    cdef column_view col_view = col.view()

    # The libcudf call runs with the GIL released.
    with nogil:
        c_result = move(libcudf_datetime.days_in_month(col_view))

    return Column.from_unique_ptr(move(c_result))
87 changes: 87 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -6438,6 +6438,42 @@ def is_leap_year(self):
-------
Series
Booleans indicating if dates belong to a leap year.

Example
-------
>>> import pandas as pd, cudf
>>> s = cudf.Series(
... pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y'))
>>> s
0 2000-12-31
1 2001-12-31
2 2002-12-31
3 2003-12-31
4 2004-12-31
5 2005-12-31
6 2006-12-31
7 2007-12-31
8 2008-12-31
9 2009-12-31
10 2010-12-31
11 2011-12-31
12 2012-12-31
dtype: datetime64[ns]
>>> s.dt.is_leap_year
0 True
1 False
2 False
3 False
4 True
5 False
6 False
7 False
8 True
9 False
10 False
11 False
12 True
dtype: bool
"""
res = libcudf.datetime.is_leap_year(self.series._column).fillna(False)
return Series._from_data(
Expand All @@ -6458,6 +6494,57 @@ def is_month_start(self):
"""
return (self.day == 1).fillna(False)

@property
def days_in_month(self):
    """
    Number of days in the month that each date falls in.

    Returns
    -------
    Series
        Integers representing the number of days in month

    Examples
    --------
    >>> import pandas as pd, cudf
    >>> s = cudf.Series(
    ...     pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M'))
    >>> s
    0    2000-08-31
    1    2000-09-30
    2    2000-10-31
    3    2000-11-30
    4    2000-12-31
    5    2001-01-31
    6    2001-02-28
    7    2001-03-31
    8    2001-04-30
    9    2001-05-31
    10   2001-06-30
    11   2001-07-31
    dtype: datetime64[ns]
    >>> s.dt.days_in_month
    0     31
    1     30
    2     31
    3     30
    4     31
    5     31
    6     28
    7     31
    8     30
    9     31
    10    30
    11    31
    dtype: int16
    """
    source = self.series
    day_counts = libcudf.datetime.days_in_month(source._column)
    # Wrap the result column in a Series with the source's index and name.
    return Series._from_data(
        ColumnAccessor({None: day_counts}),
        index=source._index,
        name=source.name,
    )

def _get_dt_field(self, field):
out_column = self.series._column.get_dt_field(field)
return Series(
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pytest

import cudf
import cudf.testing.dataset_generator as dataset_generator
from cudf.core import DataFrame, Series
from cudf.core.index import DatetimeIndex
from cudf.testing._utils import (
Expand Down Expand Up @@ -1299,6 +1300,25 @@ def test_is_leap_year():
assert_eq(expect2, got2)


@pytest.mark.parametrize("dtype", DATETIME_TYPES)
def test_days_in_months(dtype):
    """Compare ``dt.days_in_month`` between pandas and cudf on generated
    datetime data that includes nulls."""
    row_count = 1000
    column_spec = {
        "dtype": dtype,
        "null_frequency": 0.4,
        "cardinality": row_count,
    }

    generated = dataset_generator.rand_dataframe(
        dtypes_meta=[column_spec],
        rows=row_count,
        use_threads=False,
        seed=23,
    )

    # The generated frame has a single column, labelled "0".
    pandas_series = generated.to_pandas()["0"]
    cudf_series = cudf.from_pandas(pandas_series)

    expected = pandas_series.dt.days_in_month
    actual = cudf_series.dt.days_in_month
    assert_eq(expected, actual)


@pytest.mark.parametrize(
"data",
[
Expand Down