From f13178bbad2eac4454b0a264c2b724449de6450a Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Thu, 2 Dec 2021 19:45:16 +0000 Subject: [PATCH 01/13] added series.dt.floor --- cpp/include/cudf/datetime.hpp | 91 ++++++++++++++++++++++++ cpp/src/datetime/datetime_ops.cu | 78 +++++++++++++++++++- cpp/tests/datetime/datetime_ops_test.cpp | 72 +++++++++++++++++++ docs/cudf/source/api_docs/series.rst | 1 + python/cudf/cudf/_lib/cpp/datetime.pxd | 13 ++++ python/cudf/cudf/_lib/datetime.pyx | 27 +++++++ python/cudf/cudf/core/column/datetime.py | 3 + python/cudf/cudf/core/series.py | 35 +++++++++ python/cudf/cudf/tests/test_datetime.py | 31 ++++++++ 9 files changed, 349 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 71e5968bf07..489ffb25ffe 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -469,5 +469,96 @@ std::unique_ptr floor_nanosecond( column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Round down to the nearest day + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest hour + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest minute + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest second + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest millisecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_millisecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest microsecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr round_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace datetime } // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 717bd7ac0a8..382cc3653d3 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -54,7 +54,7 @@ enum class datetime_component { NANOSECOND }; -enum class rounding_kind { CEIL, FLOOR }; +enum class rounding_kind { CEIL, FLOOR, ROUND }; template struct extract_component_operator { @@ -100,6 +100,7 @@ struct RoundFunctor { switch (round_kind) { case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); + case rounding_kind::ROUND: return cuda::std::chrono::round(dt); default: cudf_assert(false && "Unsupported rounding kind."); } __builtin_unreachable(); @@ -224,7 +225,7 @@ struct is_leap_year_op { } }; -// Specific function for applying ceil/floor date ops +// Specific function for applying ceil/floor/round date ops struct dispatch_round { template std::enable_if_t(), std::unique_ptr> operator()( @@ -672,6 +673,79 @@ std::unique_ptr floor_nanosecond(column_view const& column, mr); } +std::unique_ptr round_day(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr round_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr round_minute(column_view const& 
column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr round_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr round_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr round_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr round_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::ROUND, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); +} + std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 4ac24317145..f4910681a38 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -914,4 +914,76 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_nanosecond); } +TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) +{ + using T = TypeParam; + using namespace cudf::test; + using namespace cudf::datetime; + using namespace cuda::std::chrono; + 
+ auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + + auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); + + auto host_val = to_host(input); + thrust::host_vector timestamps = host_val.first; + + std::vector round_day(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_day.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_day = + fixed_width_column_wrapper(round_day.begin(), round_day.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_day(input), expected_day); + + std::vector round_hour(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_hour.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_hour = + fixed_width_column_wrapper(round_hour.begin(), round_hour.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_hour(input), expected_hour); + + std::vector round_minute(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_minute.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_minute = fixed_width_column_wrapper( + round_minute.begin(), round_minute.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_minute(input), expected_minute); + + std::vector round_second(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_second.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_second = fixed_width_column_wrapper( + round_second.begin(), round_second.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_second(input), expected_second); + + std::vector round_millisecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_millisecond.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_millisecond = fixed_width_column_wrapper( + round_millisecond.begin(), 
round_millisecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_millisecond(input), expected_millisecond); + + std::vector round_microsecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_microsecond.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_microsecond = fixed_width_column_wrapper( + round_microsecond.begin(), round_microsecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_second(input), expected_microsecond); + + std::vector round_nanosecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), round_nanosecond.begin(), [](auto i) { + return time_point_cast(round(i)); + }); + auto expected_nanosecond = fixed_width_column_wrapper( + round_nanosecond.begin(), rounded_nanosecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_millisecond(input), expected_nanosecond); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index b90ee628332..dc1632f0a72 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -302,6 +302,7 @@ Datetime methods isocalendar ceil floor + round Timedelta properties diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 38ed9fbd769..f75b39ce6ee 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -39,6 +39,19 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] floor_nanosecond( const column_view& column ) except + + cdef unique_ptr[column] round_day(const column_view& column) except + + cdef unique_ptr[column] round_hour(const column_view& column) except + + cdef unique_ptr[column] round_minute(const column_view& column) except + + cdef unique_ptr[column] round_second(const column_view& column) except + + cdef unique_ptr[column] round_millisecond( + const column_view& column + ) except + + cdef 
unique_ptr[column] round_microsecond( + const column_view& column + ) except + + cdef unique_ptr[column] round_nanosecond( + const column_view& column + ) except + cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 3215088c438..3c05a17c268 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -116,6 +116,33 @@ def floor_datetime(Column col, object field): return result +def round_datetime(Column col, object field): + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html + if field == "D": + c_result = move(libcudf_datetime.round_day(col_view)) + elif field == "H": + c_result = move(libcudf_datetime.round_hour(col_view)) + elif field == "T" or field == "min": + c_result = move(libcudf_datetime.round_minute(col_view)) + elif field == "S": + c_result = move(libcudf_datetime.round_second(col_view)) + elif field == "L" or field == "ms": + c_result = move(libcudf_datetime.round_millisecond(col_view)) + elif field == "U" or field == "us": + c_result = move(libcudf_datetime.round_microsecond(col_view)) + elif field == "N": + c_result = move(libcudf_datetime.round_nanosecond(col_view)) + else: + raise ValueError(f"Invalid resolution: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) + return result + + def is_leap_year(Column col): """Returns a boolean indicator whether the year of the date is a leap year """ diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 7c8837ef45f..08d72f1c6ee 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -228,6 +228,9 @@ def ceil(self, freq: str) -> ColumnBase: def floor(self, freq: str) -> ColumnBase: return 
libcudf.datetime.floor_datetime(self, freq) + def round(self, freq: str) -> ColumnBase: + return libcudf.datetime.round_datetime(self, freq) + def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, cudf.Scalar): return other diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cf035ef457d..2ce997ece21 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4664,6 +4664,41 @@ def floor(self, freq): data={self.series.name: out_column}, index=self.series._index ) + def round(self, freq): + """ + Perform round operation on the data to the specified freq. + + Parameters + ---------- + freq : str + One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. + Must be a fixed frequency like 'S' (second) not 'ME' (month end). + See `frequency aliases `__ + for more details on these aliases. + + Returns + ------- + Series + Series with all timestamps rounded up to the specified frequency. + The index is preserved. + + Examples + -------- + >>> import cudf, pandas + >>> rng = pandas.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> cudf.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + out_column = self.series._column.round(freq) + + return Series._from_data( + data={self.series.name: out_column}, index=self.series._index + ) + def strftime(self, date_format, *args, **kwargs): """ Convert to Series using specified ``date_format``. 
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index a95be4f7932..72601a3da2c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1819,3 +1819,34 @@ def test_floor(data, time_type, resolution): expect = ps.dt.floor(resolution) got = gs.dt.floor(resolution) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + ) + ], +) +@pytest.mark.parametrize("time_type", DATETIME_TYPES) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_round(data, time_type, resolution): + + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() + + expect = ps.dt.round(resolution) + got = gs.dt.round(resolution) + assert_eq(expect, got) From 789ace3438f946dc2f89b99ef0701b7ad77f6950 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Thu, 2 Dec 2021 22:16:11 +0000 Subject: [PATCH 02/13] added datetimeindex.round --- docs/cudf/source/api_docs/index_objects.rst | 1 + python/cudf/cudf/core/index.py | 33 +++++++++++++++++++++ python/cudf/cudf/tests/test_index.py | 13 ++++++++ 3 files changed, 47 insertions(+) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a4dd5ff9c8..497f7a413b9 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -282,6 +282,7 @@ Time-specific operations DatetimeIndex.round DatetimeIndex.ceil DatetimeIndex.floor + DatetimeIndex.round Conversion ~~~~~~~~~~ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8f905ee6d49..88dd6b4b46b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1933,6 +1933,39 @@ def floor(self, 
field): return self.__class__._from_data({self.name: out_column}) + def round(self, field): + """ + Perform round operation on the data to the specified freq. + + Parameters + ---------- + field : str + One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. + Must be a fixed frequency like 'S' (second) not 'ME' (month end). + See `frequency aliases `__ + for more details on these aliases. + + Returns + ------- + DatetimeIndex + Index of the same type for a DatetimeIndex + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> rng = cudf.Index(pd.date_range('1/1/2018 11:59:00', + ... periods=3, freq='min')) + >>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + """ + out_column = self._values.round(field) + + return self.__class__._from_data({self.name: out_column}) + class TimedeltaIndex(GenericIndex): """ diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index ab211616a02..c7fca2075f5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2496,3 +2496,16 @@ def test_index_datetime_floor(resolution): cuidx_floor = cuidx.floor(resolution) assert_eq(pidx_floor, cuidx_floor) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_round(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_floor = pidx.round(resolution) + cuidx_floor = cuidx.round(resolution) + + assert_eq(pidx_floor, cuidx_floor) From 96d22baa32ccad57ab6cd6500e2937e90fedffef Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Thu, 2 Dec 2021 22:30:27 +0000 Subject: [PATCH 03/13] move round impl. 
to IndexedFrame --- python/cudf/cudf/core/frame.py | 114 ----------------------- python/cudf/cudf/core/indexed_frame.py | 121 ++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 115 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d7a75cb9f40..61ce64e7d6b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1836,120 +1836,6 @@ def _shift(self, offset, fill_value=None): zip(self._column_names, data_columns), self._index ) - def round(self, decimals=0, how="half_even"): - """ - Round to a variable number of decimal places. - - Parameters - ---------- - decimals : int, dict, Series - Number of decimal places to round each column to. This parameter - must be an int for a Series. For a DataFrame, a dict or a Series - are also valid inputs. If an int is given, round each column to the - same number of places. Otherwise dict and Series round to variable - numbers of places. Column names should be in the keys if - `decimals` is a dict-like, or in the index if `decimals` is a - Series. Any columns not included in `decimals` will be left as is. - Elements of `decimals` which are not columns of the input will be - ignored. - how : str, optional - Type of rounding. Can be either "half_even" (default) - of "half_up" rounding. - - Returns - ------- - Series or DataFrame - A Series or DataFrame with the affected columns rounded to the - specified number of decimal places. - - Examples - -------- - **Series** - - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - - **DataFrame** - - >>> df = cudf.DataFrame( - [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], - ... columns=['dogs', 'cats'] - ... 
) - >>> df - dogs cats - 0 0.21 0.32 - 1 0.01 0.67 - 2 0.66 0.03 - 3 0.21 0.18 - - By providing an integer each column is rounded to the same number - of decimal places - - >>> df.round(1) - dogs cats - 0 0.2 0.3 - 1 0.0 0.7 - 2 0.7 0.0 - 3 0.2 0.2 - - With a dict, the number of places for specific columns can be - specified with the column names as key and the number of decimal - places as value - - >>> df.round({'dogs': 1, 'cats': 0}) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - - Using a Series, the number of places for specific columns can be - specified with the column names as index and the number of - decimal places as value - - >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) - >>> df.round(decimals) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - """ - - if isinstance(decimals, cudf.Series): - decimals = decimals.to_pandas() - - if isinstance(decimals, pd.Series): - if not decimals.index.is_unique: - raise ValueError("Index of decimals must be unique") - decimals = decimals.to_dict() - elif isinstance(decimals, int): - decimals = {name: decimals for name in self._column_names} - elif not isinstance(decimals, abc.Mapping): - raise TypeError( - "decimals must be an integer, a dict-like or a Series" - ) - - cols = { - name: col.round(decimals[name], how=how) - if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) - else col.copy(deep=True) - for name, col in self._data.items() - } - - return self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=self._index, - ) - @annotate("SAMPLE", color="orange", domain="cudf_python") def sample( self, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2044bad9675..97d6179f846 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4,6 +4,7 @@ from __future__ 
import annotations import warnings +from collections import abc from typing import Type, TypeVar from uuid import uuid4 @@ -15,7 +16,12 @@ import cudf import cudf._lib as libcudf from cudf._typing import ColumnLike -from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like +from cudf.api.types import ( + _is_non_decimal_numeric_dtype, + is_categorical_dtype, + is_integer_dtype, + is_list_like, +) from cudf.core.column import arange from cudf.core.frame import Frame from cudf.core.index import Index @@ -695,6 +701,119 @@ def _align_to_index( return result + def round(self, decimals=0, how="half_even"): + """ + Round to a variable number of decimal places. + + Parameters + ---------- + decimals : int, dict, Series + Number of decimal places to round each column to. This parameter + must be an int for a Series. For a DataFrame, a dict or a Series + are also valid inputs. If an int is given, round each column to the + same number of places. Otherwise dict and Series round to variable + numbers of places. Column names should be in the keys if + `decimals` is a dict-like, or in the index if `decimals` is a + Series. Any columns not included in `decimals` will be left as is. + Elements of `decimals` which are not columns of the input will be + ignored. + how : str, optional + Type of rounding. Can be either "half_even" (default) + of "half_up" rounding. + + Returns + ------- + Series or DataFrame + A Series or DataFrame with the affected columns rounded to the + specified number of decimal places. + + Examples + -------- + **Series** + + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + + **DataFrame** + + >>> df = cudf.DataFrame( + [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats'] + ... 
) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + """ + if isinstance(decimals, cudf.Series): + decimals = decimals.to_pandas() + + if isinstance(decimals, pd.Series): + if not decimals.index.is_unique: + raise ValueError("Index of decimals must be unique") + decimals = decimals.to_dict() + elif isinstance(decimals, int): + decimals = {name: decimals for name in self._column_names} + elif not isinstance(decimals, abc.Mapping): + raise TypeError( + "decimals must be an integer, a dict-like or a Series" + ) + + cols = { + name: col.round(decimals[name], how=how) + if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) + else col.copy(deep=True) + for name, col in self._data.items() + } + + return self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ), + index=self._index, + ) + def resample( self, rule, From bdc25c9308cde80968747a176cf87d19a121149c Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Thu, 2 Dec 2021 22:56:45 +0000 Subject: [PATCH 04/13] added impl. 
for autocorr --- python/cudf/cudf/core/series.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2ce997ece21..1faa785a95a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2816,6 +2816,31 @@ def corr(self, other, method="pearson", min_periods=None): return lhs._column.corr(rhs._column) + def autocorr(self, lag=1): + """Compute the lag-N autocorrelation. This method computes the Pearson + correlation between the Series and its shifted self. + + Parameters + ---------- + lag : int, default 1 + Number of lags to apply before performing autocorrelation. + + Returns + ------- + result : float + The Pearson correlation between self and self.shift(lag). + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05]) + >>> s.autocorr() + 0.10355263309024071 + >>> s.autocorr(lag=2) + -0.9999999999999999 + """ + return self.corr(self, self.shift(lag)) + def isin(self, values): """Check whether values are contained in Series. From 971366f15ebe2cc84bce50475bb5fb80a3e6c028 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Thu, 2 Dec 2021 23:06:48 +0000 Subject: [PATCH 05/13] fixed bug --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 1faa785a95a..3c90bce2003 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2839,7 +2839,7 @@ def autocorr(self, lag=1): >>> s.autocorr(lag=2) -0.9999999999999999 """ - return self.corr(self, self.shift(lag)) + return self.corr(self.shift(lag)) def isin(self, values): """Check whether values are contained in Series. 
From 710e30a8dbb72ef01d9db4e6fe31809713ad69a4 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Thu, 2 Dec 2021 23:12:57 +0000 Subject: [PATCH 06/13] added to series.rst --- docs/cudf/source/api_docs/series.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index dc1632f0a72..27615d1a0b0 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -118,6 +118,7 @@ Computations / descriptive stats Series.abs Series.all Series.any + Series.autocorr Series.ceil Series.clip Series.corr From 46bb589a50b1687475dbf338bd234506b636ef98 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Fri, 3 Dec 2021 00:00:15 +0000 Subject: [PATCH 07/13] code clean up --- cpp/include/cudf/datetime.hpp | 94 --------------- cpp/src/datetime/datetime_ops.cu | 80 +------------ cpp/tests/datetime/datetime_ops_test.cpp | 72 ------------ docs/cudf/source/api_docs/index_objects.rst | 1 - docs/cudf/source/api_docs/series.rst | 1 - python/cudf/cudf/_lib/cpp/datetime.pxd | 13 --- python/cudf/cudf/_lib/datetime.pyx | 27 ----- python/cudf/cudf/core/column/datetime.py | 3 - python/cudf/cudf/core/frame.py | 113 ++++++++++++++++++ python/cudf/cudf/core/index.py | 33 ------ python/cudf/cudf/core/indexed_frame.py | 121 +------------------- python/cudf/cudf/core/series.py | 35 ------ python/cudf/cudf/tests/test_datetime.py | 31 ----- python/cudf/cudf/tests/test_index.py | 13 --- 14 files changed, 117 insertions(+), 520 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 489ffb25ffe..bc88fbc77b5 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -468,97 +468,3 @@ std::unique_ptr floor_microsecond( std::unique_ptr floor_nanosecond( column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest day - * - * @param column cudf::column_view of 
the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr round_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace datetime -} // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 382cc3653d3..d1334c0526f 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -54,7 +54,7 @@ enum class datetime_component { NANOSECOND }; -enum class rounding_kind { CEIL, FLOOR, ROUND }; +enum class rounding_kind { CEIL, FLOOR }; template struct extract_component_operator { @@ -90,7 +90,7 @@ struct extract_component_operator { } }; -// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round +// This functor takes the rounding type as runtime info and dispatches to the ceil/floor // function. 
template struct RoundFunctor { @@ -100,7 +100,6 @@ struct RoundFunctor { switch (round_kind) { case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); - case rounding_kind::ROUND: return cuda::std::chrono::round(dt); default: cudf_assert(false && "Unsupported rounding kind."); } __builtin_unreachable(); @@ -225,7 +224,7 @@ struct is_leap_year_op { } }; -// Specific function for applying ceil/floor/round date ops +// Specific function for applying ceil/floor date ops struct dispatch_round { template std::enable_if_t(), std::unique_ptr> operator()( @@ -673,79 +672,6 @@ std::unique_ptr floor_nanosecond(column_view const& column, mr); } -std::unique_ptr round_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::DAY, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::HOUR, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_minute(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::MINUTE, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_second(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::SECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::MILLISECOND, - 
column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::MICROSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr round_nanosecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::ROUND, - detail::datetime_component::NANOSECOND, - column, - rmm::cuda_stream_default, - mr); -} - std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index f4910681a38..4ac24317145 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -914,76 +914,4 @@ TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_nanosecond); } -TYPED_TEST(TypedDatetimeOpsTest, TestRoundDatetime) -{ - using T = TypeParam; - using namespace cudf::test; - using namespace cudf::datetime; - using namespace cuda::std::chrono; - - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); - - auto host_val = to_host(input); - thrust::host_vector timestamps = host_val.first; - - std::vector round_day(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), round_day.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_day = - fixed_width_column_wrapper(round_day.begin(), round_day.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_day(input), expected_day); - - std::vector round_hour(timestamps.size()); - 
std::transform(timestamps.begin(), timestamps.end(), round_hour.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_hour = - fixed_width_column_wrapper(round_hour.begin(), round_hour.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_hour(input), expected_hour); - - std::vector round_minute(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), round_minute.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_minute = fixed_width_column_wrapper( - round_minute.begin(), round_minute.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_minute(input), expected_minute); - - std::vector round_second(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), round_second.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_second = fixed_width_column_wrapper( - round_second.begin(), round_second.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_second(input), expected_second); - - std::vector round_millisecond(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), round_millisecond.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_millisecond = fixed_width_column_wrapper( - round_millisecond.begin(), round_millisecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_millisecond(input), expected_millisecond); - - std::vector round_microsecond(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), round_microsecond.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_microsecond = fixed_width_column_wrapper( - round_microsecond.begin(), round_microsecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_second(input), expected_microsecond); - - std::vector round_nanosecond(timestamps.size()); - std::transform(timestamps.begin(), timestamps.end(), round_nanosecond.begin(), [](auto i) { - return time_point_cast(round(i)); - }); - auto expected_nanosecond = 
fixed_width_column_wrapper( - round_nanosecond.begin(), rounded_nanosecond.end()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*round_millisecond(input), expected_nanosecond); -} - CUDF_TEST_PROGRAM_MAIN() diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 497f7a413b9..2a4dd5ff9c8 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -282,7 +282,6 @@ Time-specific operations DatetimeIndex.round DatetimeIndex.ceil DatetimeIndex.floor - DatetimeIndex.round Conversion ~~~~~~~~~~ diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 27615d1a0b0..356a8dd87f6 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -303,7 +303,6 @@ Datetime methods isocalendar ceil floor - round Timedelta properties diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index f75b39ce6ee..38ed9fbd769 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -39,19 +39,6 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] floor_nanosecond( const column_view& column ) except + - cdef unique_ptr[column] round_day(const column_view& column) except + - cdef unique_ptr[column] round_hour(const column_view& column) except + - cdef unique_ptr[column] round_minute(const column_view& column) except + - cdef unique_ptr[column] round_second(const column_view& column) except + - cdef unique_ptr[column] round_millisecond( - const column_view& column - ) except + - cdef unique_ptr[column] round_microsecond( - const column_view& column - ) except + - cdef unique_ptr[column] round_nanosecond( - const column_view& column - ) except + cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months diff --git a/python/cudf/cudf/_lib/datetime.pyx 
b/python/cudf/cudf/_lib/datetime.pyx index 3c05a17c268..3215088c438 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -116,33 +116,6 @@ def floor_datetime(Column col, object field): return result -def round_datetime(Column col, object field): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html - if field == "D": - c_result = move(libcudf_datetime.round_day(col_view)) - elif field == "H": - c_result = move(libcudf_datetime.round_hour(col_view)) - elif field == "T" or field == "min": - c_result = move(libcudf_datetime.round_minute(col_view)) - elif field == "S": - c_result = move(libcudf_datetime.round_second(col_view)) - elif field == "L" or field == "ms": - c_result = move(libcudf_datetime.round_millisecond(col_view)) - elif field == "U" or field == "us": - c_result = move(libcudf_datetime.round_microsecond(col_view)) - elif field == "N": - c_result = move(libcudf_datetime.round_nanosecond(col_view)) - else: - raise ValueError(f"Invalid resolution: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) - return result - - def is_leap_year(Column col): """Returns a boolean indicator whether the year of the date is a leap year """ diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 08d72f1c6ee..7c8837ef45f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -228,9 +228,6 @@ def ceil(self, freq: str) -> ColumnBase: def floor(self, freq: str) -> ColumnBase: return libcudf.datetime.floor_datetime(self, freq) - def round(self, freq: str) -> ColumnBase: - return libcudf.datetime.round_datetime(self, freq) - def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, cudf.Scalar): return other diff --git a/python/cudf/cudf/core/frame.py 
b/python/cudf/cudf/core/frame.py index 61ce64e7d6b..9969b9ac0fa 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1836,6 +1836,119 @@ def _shift(self, offset, fill_value=None): zip(self._column_names, data_columns), self._index ) + def round(self, decimals=0, how="half_even"): + """ + Round to a variable number of decimal places. + + Parameters + ---------- + decimals : int, dict, Series + Number of decimal places to round each column to. This parameter + must be an int for a Series. For a DataFrame, a dict or a Series + are also valid inputs. If an int is given, round each column to the + same number of places. Otherwise dict and Series round to variable + numbers of places. Column names should be in the keys if + `decimals` is a dict-like, or in the index if `decimals` is a + Series. Any columns not included in `decimals` will be left as is. + Elements of `decimals` which are not columns of the input will be + ignored. + how : str, optional + Type of rounding. Can be either "half_even" (default) + of "half_up" rounding. + + Returns + ------- + Series or DataFrame + A Series or DataFrame with the affected columns rounded to the + specified number of decimal places. + + Examples + -------- + **Series** + + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + + **DataFrame** + + >>> df = cudf.DataFrame( + [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... columns=['dogs', 'cats'] + ... 
) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + """ + if isinstance(decimals, cudf.Series): + decimals = decimals.to_pandas() + + if isinstance(decimals, pd.Series): + if not decimals.index.is_unique: + raise ValueError("Index of decimals must be unique") + decimals = decimals.to_dict() + elif isinstance(decimals, int): + decimals = {name: decimals for name in self._column_names} + elif not isinstance(decimals, abc.Mapping): + raise TypeError( + "decimals must be an integer, a dict-like or a Series" + ) + + cols = { + name: col.round(decimals[name], how=how) + if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) + else col.copy(deep=True) + for name, col in self._data.items() + } + + return self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ), + index=self._index, + ) + @annotate("SAMPLE", color="orange", domain="cudf_python") def sample( self, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 88dd6b4b46b..8f905ee6d49 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1933,39 +1933,6 @@ def floor(self, field): return 
self.__class__._from_data({self.name: out_column}) - def round(self, field): - """ - Perform round operation on the data to the specified freq. - - Parameters - ---------- - field : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - DatetimeIndex - Index of the same type for a DatetimeIndex - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> rng = cudf.Index(pd.date_range('1/1/2018 11:59:00', - ... periods=3, freq='min')) - >>> rng.round('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - """ - out_column = self._values.round(field) - - return self.__class__._from_data({self.name: out_column}) - class TimedeltaIndex(GenericIndex): """ diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 97d6179f846..2044bad9675 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4,7 +4,6 @@ from __future__ import annotations import warnings -from collections import abc from typing import Type, TypeVar from uuid import uuid4 @@ -16,12 +15,7 @@ import cudf import cudf._lib as libcudf from cudf._typing import ColumnLike -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_categorical_dtype, - is_integer_dtype, - is_list_like, -) +from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame from cudf.core.index import Index @@ -701,119 +695,6 @@ def _align_to_index( return result - def round(self, decimals=0, how="half_even"): - """ - Round to a variable number of decimal places. - - Parameters - ---------- - decimals : int, dict, Series - Number of decimal places to round each column to. 
This parameter - must be an int for a Series. For a DataFrame, a dict or a Series - are also valid inputs. If an int is given, round each column to the - same number of places. Otherwise dict and Series round to variable - numbers of places. Column names should be in the keys if - `decimals` is a dict-like, or in the index if `decimals` is a - Series. Any columns not included in `decimals` will be left as is. - Elements of `decimals` which are not columns of the input will be - ignored. - how : str, optional - Type of rounding. Can be either "half_even" (default) - of "half_up" rounding. - - Returns - ------- - Series or DataFrame - A Series or DataFrame with the affected columns rounded to the - specified number of decimal places. - - Examples - -------- - **Series** - - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - - **DataFrame** - - >>> df = cudf.DataFrame( - [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], - ... columns=['dogs', 'cats'] - ... 
) - >>> df - dogs cats - 0 0.21 0.32 - 1 0.01 0.67 - 2 0.66 0.03 - 3 0.21 0.18 - - By providing an integer each column is rounded to the same number - of decimal places - - >>> df.round(1) - dogs cats - 0 0.2 0.3 - 1 0.0 0.7 - 2 0.7 0.0 - 3 0.2 0.2 - - With a dict, the number of places for specific columns can be - specified with the column names as key and the number of decimal - places as value - - >>> df.round({'dogs': 1, 'cats': 0}) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - - Using a Series, the number of places for specific columns can be - specified with the column names as index and the number of - decimal places as value - - >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) - >>> df.round(decimals) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - """ - if isinstance(decimals, cudf.Series): - decimals = decimals.to_pandas() - - if isinstance(decimals, pd.Series): - if not decimals.index.is_unique: - raise ValueError("Index of decimals must be unique") - decimals = decimals.to_dict() - elif isinstance(decimals, int): - decimals = {name: decimals for name in self._column_names} - elif not isinstance(decimals, abc.Mapping): - raise TypeError( - "decimals must be an integer, a dict-like or a Series" - ) - - cols = { - name: col.round(decimals[name], how=how) - if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) - else col.copy(deep=True) - for name, col in self._data.items() - } - - return self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=self._index, - ) - def resample( self, rule, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3c90bce2003..b755ab7ad0a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4689,41 +4689,6 @@ def floor(self, freq): data={self.series.name: out_column}, index=self.series._index ) - def 
round(self, freq): - """ - Perform round operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - Series - Series with all timestamps rounded up to the specified frequency. - The index is preserved. - - Examples - -------- - >>> import cudf, pandas - >>> rng = pandas.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> cudf.Series(rng).dt.round("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - out_column = self.series._column.round(freq) - - return Series._from_data( - data={self.series.name: out_column}, index=self.series._index - ) - def strftime(self, date_format, *args, **kwargs): """ Convert to Series using specified ``date_format``. diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 72601a3da2c..a95be4f7932 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1819,34 +1819,3 @@ def test_floor(data, time_type, resolution): expect = ps.dt.floor(resolution) got = gs.dt.floor(resolution) assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ( - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - ) - ], -) -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] -) -def test_round(data, time_type, resolution): - - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.round(resolution) - got = gs.dt.round(resolution) - assert_eq(expect, got) diff --git 
a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c7fca2075f5..ab211616a02 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2496,16 +2496,3 @@ def test_index_datetime_floor(resolution): cuidx_floor = cuidx.floor(resolution) assert_eq(pidx_floor, cuidx_floor) - - -@pytest.mark.parametrize( - "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] -) -def test_index_datetime_round(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_floor = pidx.round(resolution) - cuidx_floor = cuidx.round(resolution) - - assert_eq(pidx_floor, cuidx_floor) From 142cd428e4dce9a598d2b6fb1dc0901bea7ec973 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Fri, 3 Dec 2021 00:02:25 +0000 Subject: [PATCH 08/13] code clean up --- cpp/include/cudf/datetime.hpp | 3 +++ cpp/src/datetime/datetime_ops.cu | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index bc88fbc77b5..74be2a3c0a6 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -468,3 +468,6 @@ std::unique_ptr floor_microsecond( std::unique_ptr floor_nanosecond( column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace datetime +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index d1334c0526f..717bd7ac0a8 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -90,7 +90,7 @@ struct extract_component_operator { } }; -// This functor takes the rounding type as runtime info and dispatches to the ceil/floor +// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round // function. 
template struct RoundFunctor { From 9bb435bbb43049f276553bd4b91d0b1de8d9eba5 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Fri, 3 Dec 2021 00:04:04 +0000 Subject: [PATCH 09/13] code clean up --- cpp/src/datetime/datetime_ops.cu | 1199 +++++++++++------------------- 1 file changed, 450 insertions(+), 749 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 717bd7ac0a8..74be2a3c0a6 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,759 +14,460 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#pragma once + #include -#include -#include -#include -#include -#include -#include +#include -#include +/** + * @file datetime.hpp + * @brief DateTime column APIs. 
+ */ namespace cudf { namespace datetime { -namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; - -enum class rounding_kind { CEIL, FLOOR }; - -template -struct extract_component_operator { - template - CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const - { - using namespace cuda::std::chrono; - - auto days_since_epoch = floor(ts); - - auto time_since_midnight = ts - days_since_epoch; - - if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - - auto hrs_ = duration_cast(time_since_midnight); - auto mins_ = duration_cast(time_since_midnight - hrs_); - auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); - - switch (Component) { - case datetime_component::YEAR: - return static_cast(year_month_day(days_since_epoch).year()); - case datetime_component::MONTH: - return static_cast(year_month_day(days_since_epoch).month()); - case datetime_component::DAY: - return static_cast(year_month_day(days_since_epoch).day()); - case datetime_component::WEEKDAY: - return year_month_weekday(days_since_epoch).weekday().iso_encoding(); - case datetime_component::HOUR: return hrs_.count(); - case datetime_component::MINUTE: return mins_.count(); - case datetime_component::SECOND: return secs_.count(); - default: return 0; - } - } -}; - -// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round -// function. 
-template -struct RoundFunctor { - template - CUDA_DEVICE_CALLABLE auto operator()(rounding_kind round_kind, Timestamp dt) - { - switch (round_kind) { - case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); - case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); - default: cudf_assert(false && "Unsupported rounding kind."); - } - __builtin_unreachable(); - } -}; - -struct RoundingDispatcher { - rounding_kind round_kind; - datetime_component component; - - RoundingDispatcher(rounding_kind round_kind, datetime_component component) - : round_kind(round_kind), component(component) - { - } - - template - CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const - { - switch (component) { - case datetime_component::DAY: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - case datetime_component::HOUR: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - case datetime_component::MINUTE: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - case datetime_component::SECOND: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - case datetime_component::MILLISECOND: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - case datetime_component::MICROSECOND: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - case datetime_component::NANOSECOND: - return time_point_cast( - RoundFunctor{}(round_kind, ts)); - default: cudf_assert(false && "Unsupported datetime rounding resolution."); - } - __builtin_unreachable(); - } -}; - -// Number of days until month indexed by leap year and month (0-based index) -static __device__ int16_t const days_until_month[2][13] = { - {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, // For non leap years - {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} // For leap years -}; - -// Round up the date to the last day of the month and return the -// date only (without the time component) -struct extract_last_day_of_month { - template - 
CUDA_DEVICE_CALLABLE timestamp_D operator()(Timestamp const ts) const - { - using namespace cuda::std::chrono; - const year_month_day ymd(floor(ts)); - auto const ymdl = year_month_day_last{ymd.year() / ymd.month() / last}; - return timestamp_D{sys_days{ymdl}}; - } -}; - -// Extract the number of days of the month -// A similar operator to `extract_last_day_of_month`, except this returns -// an integer while the other returns a timestamp. -struct days_in_month_op { - template - CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const - { - using namespace cuda::std::chrono; - auto const date = year_month_day(floor(ts)); - auto const ymdl = year_month_day_last(date.year() / date.month() / last); - return static_cast(unsigned{ymdl.day()}); - } -}; - -// Extract the day number of the year present in the timestamp -struct extract_day_num_of_year { - template - CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const - { - using namespace cuda::std::chrono; - - // Only has the days - time component is chopped off, which is what we want - auto const days_since_epoch = floor(ts); - auto const date = year_month_day(days_since_epoch); - - return days_until_month[date.year().is_leap()][unsigned{date.month()} - 1] + - unsigned{date.day()}; - } -}; - -// Extract the the quarter to which the timestamp belongs to -struct extract_quarter_op { - template - CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const - { - using namespace cuda::std::chrono; - - // Only has the days - time component is chopped off, which is what we want - auto const days_since_epoch = floor(ts); - auto const date = year_month_day(days_since_epoch); - auto const month = unsigned{date.month()}; - - // (x + y - 1) / y = ceil(x/y), where x and y are unsigned. 
x = month, y = 3 - return (month + 2) / 3; - } -}; - -// Returns true if the year is a leap year -struct is_leap_year_op { - template - CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const - { - using namespace cuda::std::chrono; - auto const days_since_epoch = floor(ts); - auto const date = year_month_day(days_since_epoch); - return date.year().is_leap(); - } -}; - -// Specific function for applying ceil/floor date ops -struct dispatch_round { - template - std::enable_if_t(), std::unique_ptr> operator()( - rounding_kind round_kind, - datetime_component component, - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto size = column.size(); - auto output_col_type = data_type{cudf::type_to_id()}; - - // Return an empty column if source column is empty - if (size == 0) return make_empty_column(output_col_type); - - auto output = make_fixed_width_column(output_col_type, - size, - cudf::detail::copy_bitmask(column, stream, mr), - column.null_count(), - stream, - mr); - - thrust::transform(rmm::exec_policy(stream), - column.begin(), - column.end(), - output->mutable_view().begin(), - RoundingDispatcher{round_kind, component}); - - return output; - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - Args&&...) 
- { - CUDF_FAIL("Must be cudf::timestamp"); - } -}; - -// Apply the functor for every element/row in the input column to create the output column -template -struct launch_functor { - column_view input; - mutable_column_view output; - - launch_functor(column_view inp, mutable_column_view out) : input(inp), output(out) {} - - template - typename std::enable_if_t::value, void> operator()( - rmm::cuda_stream_view stream) const - { - CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); - } - - template - typename std::enable_if_t::value, void> operator()( - rmm::cuda_stream_view stream) const - { - thrust::transform(rmm::exec_policy(stream), - input.begin(), - input.end(), - output.begin(), - TransformFunctor{}); - } -}; - -// Create an output column by applying the functor to every element from the input column -template -std::unique_ptr apply_datetime_op(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(is_timestamp(column.type()), "Column type should be timestamp"); - auto size = column.size(); - auto output_col_type = data_type{OutputColCudfT}; - - // Return an empty column if source column is empty - if (size == 0) return make_empty_column(output_col_type); - - auto output = make_fixed_width_column(output_col_type, - size, - cudf::detail::copy_bitmask(column, stream, mr), - column.null_count(), - stream, - mr); - auto launch = - launch_functor::type>{ - column, static_cast(*output)}; - - type_dispatcher(column.type(), launch, stream); - - return output; -} - -struct add_calendrical_months_functor { - template - typename std::enable_if_t::value, std::unique_ptr> - operator()(Args&&...) 
const - { - CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); - } - - template - typename std::enable_if_t::value, std::unique_ptr> - operator()(column_view timestamp_column, - MonthIterator months_begin, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto size = timestamp_column.size(); - auto output_col_type = timestamp_column.type(); - - // Return an empty column if source column is empty - if (size == 0) return make_empty_column(output_col_type); - - // The nullmask of `output` cannot be determined without information from - // the `months` type (column or scalar). Therefore, it is initialized as - // `UNALLOCATED` and assigned at a later stage. - auto output = - make_fixed_width_column(output_col_type, size, mask_state::UNALLOCATED, stream, mr); - auto output_mview = output->mutable_view(); - - thrust::transform(rmm::exec_policy(stream), - timestamp_column.begin(), - timestamp_column.end(), - months_begin, - output->mutable_view().begin(), - [] __device__(auto& timestamp, auto& months) { - return add_calendrical_months_with_scale_back( - timestamp, cuda::std::chrono::months{months}); - }); - return output; - } -}; - -std::unique_ptr add_calendrical_months(column_view const& timestamp_column, - column_view const& months_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); - CUDF_EXPECTS( - months_column.type().id() == type_id::INT16 or months_column.type().id() == type_id::INT32, - "Months column type should be INT16 or INT32."); - CUDF_EXPECTS(timestamp_column.size() == months_column.size(), - "Timestamp and months column should be of the same size"); - - auto const months_begin_iter = - cudf::detail::indexalator_factory::make_input_iterator(months_column); - auto output = type_dispatcher(timestamp_column.type(), - add_calendrical_months_functor{}, - timestamp_column, - 
months_begin_iter, - stream, - mr); - - auto [output_null_mask, null_count] = - cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr); - output->set_null_mask(std::move(output_null_mask), null_count); - return output; -} - -std::unique_ptr add_calendrical_months(column_view const& timestamp_column, - scalar const& months, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); - CUDF_EXPECTS(months.type().id() == type_id::INT16 or months.type().id() == type_id::INT32, - "Months type should be INT16 or INT32"); - - if (months.is_valid(stream)) { - auto const months_begin_iter = thrust::make_permutation_iterator( - cudf::detail::indexalator_factory::make_input_iterator(months), - thrust::make_constant_iterator(0)); - auto output = type_dispatcher(timestamp_column.type(), - add_calendrical_months_functor{}, - timestamp_column, - months_begin_iter, - stream, - mr); - output->set_null_mask(cudf::detail::copy_bitmask(timestamp_column, stream, mr)); - return output; - } else { - return make_timestamp_column( - timestamp_column.type(), timestamp_column.size(), mask_state::ALL_NULL, stream, mr); - } -} - -std::unique_ptr round_general(rounding_kind round_kind, - datetime_component component, - column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return cudf::type_dispatcher( - column.type(), dispatch_round{}, round_kind, component, column, stream, mr); -} - -std::unique_ptr extract_year(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr extract_month(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - 
detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr extract_day(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr extract_weekday(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr extract_hour(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr extract_minute(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr extract_second(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); -} - -std::unique_ptr last_day_of_month(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op(column, stream, mr); -} - -std::unique_ptr day_of_year(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return detail::apply_datetime_op( - column, stream, mr); -} - -std::unique_ptr is_leap_year(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return apply_datetime_op(column, stream, mr); -} - -std::unique_ptr days_in_month(column_view const& column, 
- rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return apply_datetime_op(column, stream, mr); -} - -std::unique_ptr extract_quarter(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return apply_datetime_op(column, stream, mr); -} - -} // namespace detail - -std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::DAY, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::HOUR, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::MINUTE, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::SECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::MILLISECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::MICROSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr ceil_nanosecond(column_view const& 
column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::CEIL, - detail::datetime_component::NANOSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::DAY, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::HOUR, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_minute(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::MINUTE, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_second(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::SECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_millisecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::MILLISECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_microsecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::MICROSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr floor_nanosecond(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return 
detail::round_general(detail::rounding_kind::FLOOR, - detail::datetime_component::NANOSECOND, - column, - rmm::cuda_stream_default, - mr); -} - -std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_year(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_month(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_month(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_day(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_day(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_weekday(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_weekday(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_hour(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_hour(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_minute(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_minute(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_second(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_second(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr last_day_of_month(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::last_day_of_month(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr day_of_year(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::day_of_year(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, - 
cudf::column_view const& months_column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::add_calendrical_months( - timestamp_column, months_column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, - cudf::scalar const& months, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::add_calendrical_months(timestamp_column, months, rmm::cuda_stream_default, mr); -} - -std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_leap_year(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr days_in_month(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::days_in_month(column, rmm::cuda_stream_default, mr); -} - -std::unique_ptr extract_quarter(column_view const& column, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::extract_quarter(column, rmm::cuda_stream_default, mr); -} +/** + * @addtogroup datetime_extract + * @{ + * @file + */ + +/** + * @brief Extracts year from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of the extracted int16_t years + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_year( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts month from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @returns cudf::column of the extracted int16_t months + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_month( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts day from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of the extracted int16_t days + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts the weekday from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of the extracted int16_t weekdays + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_weekday( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts hour from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @returns cudf::column of the extracted int16_t hours + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts minute from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of the extracted int16_t minutes + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts second from any date time type and returns an int16_t + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of the extracted int16_t seconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +/** + * @addtogroup datetime_compute + * @{ + * @file + */ + +/** + * @brief Computes the last day of the month in date time type and returns a TIMESTAMP_DAYS + * cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr last_day_of_month( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the day number since the start of the year from the datetime and + * returns an int16_t cudf::column. The value is between [1, {365-366}] + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of datatype INT16 containing the day number since the start of the year. + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + */ +std::unique_ptr day_of_year( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Adds or subtracts a number of months from the date time type and returns a + * timestamp column that is of the same type as the input `timestamps` column. + * + * For a given row, if the `timestamps` or the `months` column value is null, + * the output for that row is null. + * This method preserves the input time and the day where applicable. The date is rounded + * down to the last day of the month for that year, if the new day is invalid for that month. + * + * @code{.pseudo} + * Example: + * timestamps = [5/31/20 08:00:00, 5/31/20 00:00:00, 5/31/20 13:00:00, 5/31/20 23:00:00, + * 6/30/20 00:00:01, 6/30/20 14:12:13] + * months = [1 , -1 , -3 , -15 , + * -1 , 1] + * r = add_calendrical_months(timestamp_column, months_column) + * r is [6/30/20 08:00:00, 4/30/20 00:00:00, 2/29/20 13:00:00, 2/28/19 23:00:00, + * 5/30/20 00:00:01, 7/30/20 14:12:13] + * @endcode + + * @throw cudf::logic_error if `timestamps` datatype is not a TIMESTAMP or if `months` datatype + * is not INT16 or INT32. 
+ * @throw cudf::logic_error if `timestamps` column size is not equal to `months` column size. + * + * @param timestamps cudf::column_view of timestamp type. + * @param months cudf::column_view of integer type containing the number of months to add. + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of timestamp type containing the computed timestamps. + */ +std::unique_ptr add_calendrical_months( + cudf::column_view const& timestamps, + cudf::column_view const& months, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Adds or subtracts a number of months from the date time type and returns a + * timestamp column that is of the same type as the input `timestamps` column. + * + * For a given row, if the `timestamps` value is null, the output for that row is null. + * A null months scalar would result in an all null column. + * This method preserves the input time and the day where applicable. The date is rounded + * down to the last day of the month for that year, if the new day is invalid for that month. + * + * @code{.pseudo} + * Example: + * timestamps = [5/31/20 08:00:00, 6/30/20 00:00:00, 7/31/20 13:00:00] + * months = -3 + * output is [2/29/20 08:00:00, 3/30/20 00:00:00, 4/30/20 13:00:00] + * + * timestamps = [4/28/20 04:00:00, 5/30/20 01:00:00, 6/30/20 21:00:00] + * months = 1 + * output is [5/28/20 04:00:00, 6/30/20 01:00:00, 7/30/20 21:00:00] + * @endcode + * + * @throw cudf::logic_error if `timestamps` datatype is not a TIMESTAMP or if `months` datatype + * is not INT16 or INT32. + * @throw cudf::logic_error if `timestamps` column size is not equal to `months` column size. + * + * @param timestamps cudf::column_view of timestamp type. + * @param months cudf::scalar of integer type containing the number of months to add. + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @return cudf::column of timestamp type containing the computed timestamps. + */ +std::unique_ptr add_calendrical_months( + cudf::column_view const& timestamps, + cudf::scalar const& months, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Check if the year of the given date is a leap year + * + * `output[i] == true` if year of `column[i]` is a leap year + * `output[i] == false` if year of `column[i]` is not a leap year + * `output[i] is null` if `column[i]` is null + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @returns cudf::column of datatype BOOL8 truth value of the corresponding date + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + */ +std::unique_ptr is_leap_year( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extract the number of days in the month + * + * output[i] contains the number of days in the month of date `column[i]` + * output[i] is null if `column[i]` is null + * + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * @return cudf::column of datatype INT16 of days in month of the corresponding date + */ +std::unique_ptr days_in_month( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the quarter of the date + * + * `output[i]` will be a value from {1, 2, 3, 4} corresponding to the quarter of month given by + * `column[i]`. It will be null if the input row at `column[i]` is null. 
+ * + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + * + * @param column The input column containing datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * @return A column of INT16 type indicating which quarter the date is in + */ +std::unique_ptr extract_quarter( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group + +/** + * @brief Round up to the nearest day + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest hour + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest minute + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest second + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest millisecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_millisecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest microsecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest day + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest hour + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest minute + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest second + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest millisecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_millisecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest microsecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace datetime -} // namespace cudf +} // namespace cudf \ No newline at end of file From 7fdfd180f59e1985b45b94fe35bfb95254741157 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Fri, 3 Dec 2021 00:06:02 +0000 Subject: [PATCH 10/13] code cleaning --- cpp/src/datetime/datetime_ops.cu | 1197 +++++++++++++++++++----------- 1 file changed, 748 insertions(+), 449 deletions(-) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 74be2a3c0a6..efef26a762b 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,460 +14,759 @@ * limitations under the License. */ -#pragma once - +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include -#include +#include +#include -/** - * @file datetime.hpp - * @brief DateTime column APIs. - */ +#include namespace cudf { namespace datetime { -/** - * @addtogroup datetime_extract - * @{ - * @file - */ - -/** - * @brief Extracts year from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of the extracted int16_t years - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_year( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extracts month from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of the extracted int16_t months - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_month( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extracts day from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @returns cudf::column of the extracted int16_t days - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extracts day from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of the extracted int16_t days - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_weekday( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extracts hour from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of the extracted int16_t hours - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extracts minute from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @returns cudf::column of the extracted int16_t minutes - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extracts second from any date time type and returns an int16_t - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of the extracted int16_t seconds - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr extract_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** @} */ // end of group -/** - * @addtogroup datetime_compute - * @{ - * @file - */ - -/** - * @brief Computes the last day of the month in date time type and returns a TIMESTAMP_DAYS - * cudf::column. - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - */ -std::unique_ptr last_day_of_month( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Computes the day number since the start of the year from the datetime and - * returns an int16_t cudf::column. The value is between [1, {365-366}] - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of datatype INT16 containing the day number since the start of the year. 
- * @throw cudf::logic_error if input column datatype is not a TIMESTAMP - */ -std::unique_ptr day_of_year( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Adds or subtracts a number of months from the date time type and returns a - * timestamp column that is of the same type as the input `timestamps` column. - * - * For a given row, if the `timestamps` or the `months` column value is null, - * the output for that row is null. - * This method preserves the input time and the day where applicable. The date is rounded - * down to the last day of the month for that year, if the new day is invalid for that month. - * - * @code{.pseudo} - * Example: - * timestamps = [5/31/20 08:00:00, 5/31/20 00:00:00, 5/31/20 13:00:00, 5/31/20 23:00:00, - * 6/30/20 00:00:01, 6/30/20 14:12:13] - * months = [1 , -1 , -3 , -15 , - * -1 , 1] - * r = add_calendrical_months(timestamp_column, months_column) - * r is [6/30/20 08:00:00, 4/30/20 00:00:00, 2/29/20 13:00:00, 2/28/19 23:00:00, - * 5/30/20 00:00:01, 7/30/20 14:12:13] - * @endcode - - * @throw cudf::logic_error if `timestamps` datatype is not a TIMESTAMP or if `months` datatype - * is not INT16 or INT32. - * @throw cudf::logic_error if `timestamps` column size is not equal to `months` column size. - * - * @param timestamps cudf::column_view of timestamp type. - * @param months cudf::column_view of integer type containing the number of months to add. - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of timestamp type containing the computed timestamps. 
- */ -std::unique_ptr add_calendrical_months( - cudf::column_view const& timestamps, - cudf::column_view const& months, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Adds or subtracts a number of months from the date time type and returns a - * timestamp column that is of the same type as the input `timestamps` column. - * - * For a given row, if the `timestamps` value is null, the output for that row is null. - * A null months scalar would result in an all null column. - * This method preserves the input time and the day where applicable. The date is rounded - * down to the last day of the month for that year, if the new day is invalid for that month. - * - * @code{.pseudo} - * Example: - * timestamps = [5/31/20 08:00:00, 6/30/20 00:00:00, 7/31/20 13:00:00] - * months = -3 - * output is [2/29/20 08:00:00, 3/30/20 00:00:00, 4/30/20 13:00:00] - * - * timestamps = [4/28/20 04:00:00, 5/30/20 01:00:00, 6/30/20 21:00:00] - * months = 1 - * output is [5/28/20 04:00:00, 6/30/20 01:00:00, 7/30/20 21:00:00] - * @endcode - * - * @throw cudf::logic_error if `timestamps` datatype is not a TIMESTAMP or if `months` datatype - * is not INT16 or INT32. - * @throw cudf::logic_error if `timestamps` column size is not equal to `months` column size. - * - * @param timestamps cudf::column_view of timestamp type. - * @param months cudf::scalar of integer type containing the number of months to add. - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @return cudf::column of timestamp type containing the computed timestamps. 
- */ -std::unique_ptr add_calendrical_months( - cudf::column_view const& timestamps, - cudf::scalar const& months, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Check if the year of the given date is a leap year - * - * `output[i] == true` if year of `column[i]` is a leap year - * `output[i] == false` if year of `column[i]` is not a leap year - * `output[i] is null` if `column[i]` is null - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @returns cudf::column of datatype BOOL8 truth value of the corresponding date - * @throw cudf::logic_error if input column datatype is not a TIMESTAMP - */ -std::unique_ptr is_leap_year( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Extract the number of days in the month - * - * output[i] contains the number of days in the month of date `column[i]` - * output[i] is null if `column[i]` is null - * - * @throw cudf::logic_error if input column datatype is not a TIMESTAMP - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * @return cudf::column of datatype INT16 of days in month of the corresponding date - */ -std::unique_ptr days_in_month( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns the quarter of the date - * - * `output[i]` will be a value from {1, 2, 3, 4} corresponding to the quater of month given by - * `column[i]`. It will be null if the input row at `column[i]` is null. 
- * - * @throw cudf::logic_error if input column datatype is not a TIMESTAMP - * - * @param column The input column containing datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * @return A column of INT16 type indicating which quarter the date is in - */ -std::unique_ptr extract_quarter( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** @} */ // end of group - -/** - * @brief Round up to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round up to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr ceil_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest day - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_day( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest hour - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_hour( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest minute - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_minute( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest second - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_second( - cudf::column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest millisecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_millisecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest microsecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. - * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_microsecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Round down to the nearest nanosecond - * - * @param column cudf::column_view of the input datetime values - * @param mr Device memory resource used to allocate device memory of the returned column. 
- * - * @throw cudf::logic_error if input column datatype is not TIMESTAMP - * @return cudf::column of the same datetime resolution as the input column - */ -std::unique_ptr floor_nanosecond( - column_view const& column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +namespace detail { +enum class datetime_component { + INVALID = 0, + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + +enum class rounding_kind { CEIL, FLOOR }; + +template +struct extract_component_operator { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + + auto days_since_epoch = floor(ts); + + auto time_since_midnight = ts - days_since_epoch; + + if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } + + auto hrs_ = duration_cast(time_since_midnight); + auto mins_ = duration_cast(time_since_midnight - hrs_); + auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + + switch (Component) { + case datetime_component::YEAR: + return static_cast(year_month_day(days_since_epoch).year()); + case datetime_component::MONTH: + return static_cast(year_month_day(days_since_epoch).month()); + case datetime_component::DAY: + return static_cast(year_month_day(days_since_epoch).day()); + case datetime_component::WEEKDAY: + return year_month_weekday(days_since_epoch).weekday().iso_encoding(); + case datetime_component::HOUR: return hrs_.count(); + case datetime_component::MINUTE: return mins_.count(); + case datetime_component::SECOND: return secs_.count(); + default: return 0; + } + } +}; + +// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round +// function. 
+template +struct RoundFunctor { + template + CUDA_DEVICE_CALLABLE auto operator()(rounding_kind round_kind, Timestamp dt) + { + switch (round_kind) { + case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); + case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); + default: cudf_assert(false && "Unsupported rounding kind."); + } + __builtin_unreachable(); + } +}; + +struct RoundingDispatcher { + rounding_kind round_kind; + datetime_component component; + + RoundingDispatcher(rounding_kind round_kind, datetime_component component) + : round_kind(round_kind), component(component) + { + } + + template + CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const + { + switch (component) { + case datetime_component::DAY: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + case datetime_component::HOUR: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + case datetime_component::MINUTE: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + case datetime_component::SECOND: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + case datetime_component::MILLISECOND: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + case datetime_component::MICROSECOND: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + case datetime_component::NANOSECOND: + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + default: cudf_assert(false && "Unsupported datetime rounding resolution."); + } + __builtin_unreachable(); + } +}; + +// Number of days until month indexed by leap year and month (0-based index) +static __device__ int16_t const days_until_month[2][13] = { + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, // For non leap years + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} // For leap years +}; + +// Round up the date to the last day of the month and return the +// date only (without the time component) +struct extract_last_day_of_month { + template + 
CUDA_DEVICE_CALLABLE timestamp_D operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + const year_month_day ymd(floor(ts)); + auto const ymdl = year_month_day_last{ymd.year() / ymd.month() / last}; + return timestamp_D{sys_days{ymdl}}; + } +}; + +// Extract the number of days of the month +// A similar operator to `extract_last_day_of_month`, except this returns +// an integer while the other returns a timestamp. +struct days_in_month_op { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const date = year_month_day(floor(ts)); + auto const ymdl = year_month_day_last(date.year() / date.month() / last); + return static_cast(unsigned{ymdl.day()}); + } +}; + +// Extract the day number of the year present in the timestamp +struct extract_day_num_of_year { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + + // Only has the days - time component is chopped off, which is what we want + auto const days_since_epoch = floor(ts); + auto const date = year_month_day(days_since_epoch); + + return days_until_month[date.year().is_leap()][unsigned{date.month()} - 1] + + unsigned{date.day()}; + } +}; + +// Extract the the quarter to which the timestamp belongs to +struct extract_quarter_op { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + + // Only has the days - time component is chopped off, which is what we want + auto const days_since_epoch = floor(ts); + auto const date = year_month_day(days_since_epoch); + auto const month = unsigned{date.month()}; + + // (x + y - 1) / y = ceil(x/y), where x and y are unsigned. 
x = month, y = 3 + return (month + 2) / 3; + } +}; + +// Returns true if the year is a leap year +struct is_leap_year_op { + template + CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const days_since_epoch = floor(ts); + auto const date = year_month_day(days_since_epoch); + return date.year().is_leap(); + } +}; + +// Specific function for applying ceil/floor date ops +struct dispatch_round { + template + std::enable_if_t(), std::unique_ptr> operator()( + rounding_kind round_kind, + datetime_component component, + cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto size = column.size(); + auto output_col_type = data_type{cudf::type_to_id()}; + + // Return an empty column if source column is empty + if (size == 0) return make_empty_column(output_col_type); + + auto output = make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, stream, mr), + column.null_count(), + stream, + mr); + + thrust::transform(rmm::exec_policy(stream), + column.begin(), + column.end(), + output->mutable_view().begin(), + RoundingDispatcher{round_kind, component}); + + return output; + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + Args&&...) 
+ { + CUDF_FAIL("Must be cudf::timestamp"); + } +}; + +// Apply the functor for every element/row in the input column to create the output column +template +struct launch_functor { + column_view input; + mutable_column_view output; + + launch_functor(column_view inp, mutable_column_view out) : input(inp), output(out) {} + + template + typename std::enable_if_t::value, void> operator()( + rmm::cuda_stream_view stream) const + { + CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); + } + + template + typename std::enable_if_t::value, void> operator()( + rmm::cuda_stream_view stream) const + { + thrust::transform(rmm::exec_policy(stream), + input.begin(), + input.end(), + output.begin(), + TransformFunctor{}); + } +}; + +// Create an output column by applying the functor to every element from the input column +template +std::unique_ptr apply_datetime_op(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(is_timestamp(column.type()), "Column type should be timestamp"); + auto size = column.size(); + auto output_col_type = data_type{OutputColCudfT}; + + // Return an empty column if source column is empty + if (size == 0) return make_empty_column(output_col_type); + + auto output = make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, stream, mr), + column.null_count(), + stream, + mr); + auto launch = + launch_functor::type>{ + column, static_cast(*output)}; + + type_dispatcher(column.type(), launch, stream); + + return output; +} + +struct add_calendrical_months_functor { + template + typename std::enable_if_t::value, std::unique_ptr> + operator()(Args&&...) 
const + { + CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); + } + + template + typename std::enable_if_t::value, std::unique_ptr> + operator()(column_view timestamp_column, + MonthIterator months_begin, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto size = timestamp_column.size(); + auto output_col_type = timestamp_column.type(); + + // Return an empty column if source column is empty + if (size == 0) return make_empty_column(output_col_type); + + // The nullmask of `output` cannot be determined without information from + // the `months` type (column or scalar). Therefore, it is initialized as + // `UNALLOCATED` and assigned at a later stage. + auto output = + make_fixed_width_column(output_col_type, size, mask_state::UNALLOCATED, stream, mr); + auto output_mview = output->mutable_view(); + + thrust::transform(rmm::exec_policy(stream), + timestamp_column.begin(), + timestamp_column.end(), + months_begin, + output->mutable_view().begin(), + [] __device__(auto& timestamp, auto& months) { + return add_calendrical_months_with_scale_back( + timestamp, cuda::std::chrono::months{months}); + }); + return output; + } +}; + +std::unique_ptr add_calendrical_months(column_view const& timestamp_column, + column_view const& months_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); + CUDF_EXPECTS( + months_column.type().id() == type_id::INT16 or months_column.type().id() == type_id::INT32, + "Months column type should be INT16 or INT32."); + CUDF_EXPECTS(timestamp_column.size() == months_column.size(), + "Timestamp and months column should be of the same size"); + + auto const months_begin_iter = + cudf::detail::indexalator_factory::make_input_iterator(months_column); + auto output = type_dispatcher(timestamp_column.type(), + add_calendrical_months_functor{}, + timestamp_column, + 
months_begin_iter, + stream, + mr); + + auto [output_null_mask, null_count] = + cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr); + output->set_null_mask(std::move(output_null_mask), null_count); + return output; +} + +std::unique_ptr add_calendrical_months(column_view const& timestamp_column, + scalar const& months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); + CUDF_EXPECTS(months.type().id() == type_id::INT16 or months.type().id() == type_id::INT32, + "Months type should be INT16 or INT32"); + + if (months.is_valid(stream)) { + auto const months_begin_iter = thrust::make_permutation_iterator( + cudf::detail::indexalator_factory::make_input_iterator(months), + thrust::make_constant_iterator(0)); + auto output = type_dispatcher(timestamp_column.type(), + add_calendrical_months_functor{}, + timestamp_column, + months_begin_iter, + stream, + mr); + output->set_null_mask(cudf::detail::copy_bitmask(timestamp_column, stream, mr)); + return output; + } else { + return make_timestamp_column( + timestamp_column.type(), timestamp_column.size(), mask_state::ALL_NULL, stream, mr); + } +} + +std::unique_ptr round_general(rounding_kind round_kind, + datetime_component component, + column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::type_dispatcher( + column.type(), dispatch_round{}, round_kind, component, column, stream, mr); +} + +std::unique_ptr extract_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + 
detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_day(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_weekday(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_hour(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_minute(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_second(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr last_day_of_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op(column, stream, mr); +} + +std::unique_ptr day_of_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op( + column, stream, mr); +} + +std::unique_ptr is_leap_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + +std::unique_ptr days_in_month(column_view const& column, 
+ rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + +std::unique_ptr extract_quarter(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + +} // namespace detail + +std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr ceil_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr ceil_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr ceil_nanosecond(column_view const& 
column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_day(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_minute(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return 
detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_year(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_month(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_month(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_day(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_day(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_weekday(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_weekday(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_hour(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_minute(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_minute(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_second(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_second(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr last_day_of_month(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::last_day_of_month(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr day_of_year(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::day_of_year(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, + 
cudf::column_view const& months_column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::add_calendrical_months( + timestamp_column, months_column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, + cudf::scalar const& months, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::add_calendrical_months(timestamp_column, months, rmm::cuda_stream_default, mr); +} + +std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_leap_year(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr days_in_month(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::days_in_month(column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr extract_quarter(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_quarter(column, rmm::cuda_stream_default, mr); +} } // namespace datetime } // namespace cudf \ No newline at end of file From dbbd88cb9e891f199118e7d7e69d559b6ad3bd81 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Fri, 3 Dec 2021 00:07:03 +0000 Subject: [PATCH 11/13] last cleanup --- cpp/include/cudf/datetime.hpp | 2 +- cpp/src/datetime/datetime_ops.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 74be2a3c0a6..71e5968bf07 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -470,4 +470,4 @@ std::unique_ptr floor_nanosecond( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace datetime -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index efef26a762b..717bd7ac0a8 100644 --- 
a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -769,4 +769,4 @@ std::unique_ptr extract_quarter(column_view const& column, } } // namespace datetime -} // namespace cudf \ No newline at end of file +} // namespace cudf From 1db75893bb39e179514580b26f7eaf591ca27ac4 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Fri, 3 Dec 2021 04:15:32 +0000 Subject: [PATCH 12/13] add test cases --- python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 73fe46746ce..e66a5b40316 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1346,3 +1346,15 @@ def test_nullable_bool_dtype_series(data, bool_dtype): gsr = cudf.Series(data, dtype=bool_dtype) assert_eq(psr, gsr.to_pandas(nullable=True)) + + +@pytest.mark.parametrize( + "cudf_series", [(cudf.Series([0.25, 0.5, 0.2, -0.05]))] +) +def test_autocorr(cudf_series): + psr = cudf_series.to_pandas() + + cudf_corr = cudf_series.autocorr() + pd_corr = psr.autocorr() + + assert_eq(pd_corr, cudf_corr) From ce2547b78659857c175da56740ae796771e9fa48 Mon Sep 17 00:00:00 2001 From: Mayank Anand Date: Tue, 7 Dec 2021 17:38:57 +0000 Subject: [PATCH 13/13] added further test cases, parametrized lag --- python/cudf/cudf/tests/test_series.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index fe12010a432..d59e3ba7571 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1349,13 +1349,18 @@ def test_nullable_bool_dtype_series(data, bool_dtype): @pytest.mark.parametrize( - "cudf_series", [(cudf.Series([0.25, 0.5, 0.2, -0.05]))] + "cudf_series", + [ + cudf.Series([0.25, 0.5, 0.2, -0.05]), + cudf.Series([0, 1, 2, np.nan, 4, cudf.NA, 6]), + ], ) -def test_autocorr(cudf_series): 
+@pytest.mark.parametrize("lag", [1, 2, 3, 4]) +def test_autocorr(cudf_series, lag): psr = cudf_series.to_pandas() - cudf_corr = cudf_series.autocorr() - pd_corr = psr.autocorr() + cudf_corr = cudf_series.autocorr(lag=lag) + pd_corr = psr.autocorr(lag=lag) assert_eq(pd_corr, cudf_corr)