diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index a8955ffb17c..fb04336871f 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -36,7 +36,7 @@ namespace datetime { */ /** - * @brief Extracts year from any date time type and returns an int16_t + * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -50,7 +50,7 @@ std::unique_ptr extract_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts month from any date time type and returns an int16_t + * @brief Extracts month from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -64,7 +64,7 @@ std::unique_ptr extract_month( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -78,7 +78,7 @@ std::unique_ptr extract_day( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts weekday from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -92,7 +92,7 @@ std::unique_ptr extract_weekday( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts hour from any date time type and returns an int16_t + * @brief Extracts hour from any datetime type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -106,7 +106,7 @@ std::unique_ptr extract_hour( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts minute from any date time type and returns an int16_t + * @brief Extracts minute from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -120,7 +120,7 @@ std::unique_ptr extract_minute( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts second from any date time type and returns an int16_t + * @brief Extracts second from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -133,6 +133,57 @@ std::unique_ptr extract_second( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extracts millisecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. + * For example, the millisecond fraction of 1.234567890 seconds is 234. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t milliseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_millisecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts microsecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. + * For example, the microsecond fraction of 1.234567890 seconds is 567. 
+ * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t microseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_microsecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts nanosecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. + * For example, the nanosecond fraction of 1.234567890 seconds is 890. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t nanoseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_nanosecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group /** * @addtogroup datetime_compute @@ -141,7 +192,7 @@ std::unique_ptr extract_second( */ /** - * @brief Computes the last day of the month in date time type and returns a TIMESTAMP_DAYS + * @brief Computes the last day of the month in datetime type and returns a TIMESTAMP_DAYS * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -169,7 +220,7 @@ std::unique_ptr day_of_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. 
* * For a given row, if the `timestamps` or the `months` column value is null, @@ -204,7 +255,7 @@ std::unique_ptr add_calendrical_months( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. * * For a given row, if the `timestamps` value is null, the output for that row is null. diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 7a2545fbdcf..d17e641533e 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -94,6 +94,39 @@ std::unique_ptr extract_second( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_millisecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr extract_microsecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_nanosecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *) * diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ee026d6c395..e89792525c9 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,9 +76,22 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - auto hrs_ = duration_cast(time_since_midnight); - auto mins_ = duration_cast(time_since_midnight - hrs_); - auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + auto const hrs_ = [&] { return duration_cast(time_since_midnight); }; + auto const mins_ = [&] { return duration_cast(time_since_midnight) - hrs_(); }; + auto const secs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_(); + }; + auto const millisecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_(); + }; + auto const microsecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - + millisecs_(); + }; + auto const nanosecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - + millisecs_() - microsecs_(); + }; switch (Component) { case datetime_component::YEAR: @@ -89,9 +102,12 @@ struct 
extract_component_operator { return static_cast(year_month_day(days_since_epoch).day()); case datetime_component::WEEKDAY: return year_month_weekday(days_since_epoch).weekday().iso_encoding(); - case datetime_component::HOUR: return hrs_.count(); - case datetime_component::MINUTE: return mins_.count(); - case datetime_component::SECOND: return secs_.count(); + case datetime_component::HOUR: return hrs_().count(); + case datetime_component::MINUTE: return mins_().count(); + case datetime_component::SECOND: return secs_().count(); + case datetime_component::MILLISECOND: return millisecs_().count(); + case datetime_component::MICROSECOND: return microsecs_().count(); + case datetime_component::NANOSECOND: return nanosecs_().count(); default: return 0; } } @@ -495,6 +511,33 @@ std::unique_ptr extract_second(column_view const& column, cudf::type_id::INT16>(column, stream, mr); } +std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -607,6 +650,27 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, cudf::default_stream_value, mr); } +std::unique_ptr 
extract_millisecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_millisecond_fraction(column, cudf::default_stream_value, mr); +} + +std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_microsecond_fraction(column, cudf::default_stream_value, mr); +} + +std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_nanosecond_fraction(column, cudf::default_stream_value, mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 2898a649e36..c6d36b2aa6e 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -60,6 +60,9 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) EXPECT_THROW(extract_hour(col), cudf::logic_error); EXPECT_THROW(extract_minute(col), cudf::logic_error); EXPECT_THROW(extract_second(col), cudf::logic_error); + EXPECT_THROW(extract_millisecond_fraction(col), cudf::logic_error); + EXPECT_THROW(extract_microsecond_fraction(col), cudf::logic_error); + EXPECT_THROW(extract_nanosecond_fraction(col), cudf::logic_error); EXPECT_THROW(last_day_of_month(col), cudf::logic_error); EXPECT_THROW(day_of_year(col), cudf::logic_error); EXPECT_THROW(add_calendrical_months( @@ -97,12 +100,21 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) 1674631932929 // 2023-01-25 07:32:12.929 GMT }; + auto timestamps_ns = + cudf::test::fixed_width_column_wrapper{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_D), 
fixed_width_column_wrapper{1965, 2018, 2023}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_s), fixed_width_column_wrapper{1965, 2018, 2023}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ms), fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ns), + fixed_width_column_wrapper{1969, 1970, 1970}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_D), fixed_width_column_wrapper{10, 7, 1}); @@ -110,6 +122,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{10, 7, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ms), fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ns), + fixed_width_column_wrapper{12, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_D), fixed_width_column_wrapper{26, 4, 25}); @@ -117,6 +131,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{26, 4, 25}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ms), fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ns), + fixed_width_column_wrapper{31, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_D), fixed_width_column_wrapper{2, 3, 3}); @@ -124,6 +140,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{2, 3, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ns), + fixed_width_column_wrapper{3, 4, 4}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); @@ -131,6 +149,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{14, 12, 7}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ms), fixed_width_column_wrapper{14, 12, 7}); +
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ns), + fixed_width_column_wrapper{23, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); @@ -138,6 +158,8 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{1, 0, 32}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ms), fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), + fixed_width_column_wrapper{59, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_D), fixed_width_column_wrapper{0, 0, 0}); @@ -145,6 +167,35 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{12, 0, 12}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_ms), fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_ns), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_D), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_s), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ms), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ns), + fixed_width_column_wrapper{976, 23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_D), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_s), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ms), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ns), + fixed_width_column_wrapper{675, 432, 234}); + +
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_D), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_s), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ms), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), + fixed_width_column_wrapper{766, 424, 623}); } template @@ -175,6 +226,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), int16s); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps), int16s); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 8e0e3bbd411..6edd15e7176 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -262,12 +262,15 @@ Time/date components DatetimeIndex.hour DatetimeIndex.minute DatetimeIndex.second - DatetimeIndex.dayofweek - DatetimeIndex.dayofyear + DatetimeIndex.microsecond + DatetimeIndex.nanosecond DatetimeIndex.day_of_year + DatetimeIndex.dayofyear + DatetimeIndex.dayofweek DatetimeIndex.weekday - DatetimeIndex.is_leap_year DatetimeIndex.quarter + DatetimeIndex.is_leap_year + DatetimeIndex.isocalendar Time-specific operations diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 53042041f6d..842319338b3 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ 
-260,25 +260,27 @@ Datetime properties .. autosummary:: :toctree: api/ + year + month day - dayofweek - dayofyear - days_in_month - day_of_year hour minute - month second + microsecond + nanosecond + dayofweek weekday - year - is_leap_year + dayofyear + day_of_year + quarter is_month_start is_month_end is_quarter_start is_quarter_end is_year_start is_year_end - quarter + is_leap_year + days_in_month Datetime methods ^^^^^^^^^^^^^^^^ @@ -286,11 +288,11 @@ Datetime methods .. autosummary:: :toctree: api/ - strftime isocalendar - ceil - floor + strftime round + floor + ceil Timedelta properties @@ -300,11 +302,11 @@ Timedelta properties .. autosummary:: :toctree: api/ - components days + seconds microseconds nanoseconds - seconds + components .. _api.series.str: .. include:: string_handling.rst diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst index 1496d68db6f..2285bb8fb7a 100644 --- a/docs/cudf/source/api_docs/string_handling.rst +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -28,6 +28,7 @@ strings and apply several methods to it. These can be accessed like filter_tokens find findall + find_multiple get get_json_object hex_to_int diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 9fbac3b2578..47c6ba408fb 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -42,7 +42,7 @@ "\n", "2. We can also use `DataFrame.values`.\n", "\n", - "3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `as_gpu_matrix` and CuPy's `asarray` functionality." + "3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `to_cupy` functionality." 
] }, { diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 74addb87357..d03587745e1 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -15,6 +15,15 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_hour(const column_view& column) except + cdef unique_ptr[column] extract_minute(const column_view& column) except + cdef unique_ptr[column] extract_second(const column_view& column) except + + cdef unique_ptr[column] extract_millisecond_fraction( + const column_view& column + ) except + + cdef unique_ptr[column] extract_microsecond_fraction( + const column_view& column + ) except + + cdef unique_ptr[column] extract_nanosecond_fraction( + const column_view& column + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index e218400a2db..cb0a245b915 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -49,6 +49,18 @@ def extract_datetime_component(Column col, object field): c_result = move(libcudf_datetime.extract_minute(col_view)) elif field == "second": c_result = move(libcudf_datetime.extract_second(col_view)) + elif field == "millisecond": + c_result = move( + libcudf_datetime.extract_millisecond_fraction(col_view) + ) + elif field == "microsecond": + c_result = move( + libcudf_datetime.extract_microsecond_fraction(col_view) + ) + elif field == "nanosecond": + c_result = move( + libcudf_datetime.extract_nanosecond_fraction(col_view) + ) elif field == "day_of_year": c_result = move(libcudf_datetime.day_of_year(col_view)) else: diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index ff558a06d87..22a5066a20e 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ 
b/python/cudf/cudf/_lib/strings/__init__.py @@ -61,6 +61,7 @@ startswith, startswith_multiple, ) +from cudf._lib.strings.find_multiple import find_multiple from cudf._lib.strings.findall import findall from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object from cudf._lib.strings.padding import ( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 287e68531f8..c84e4ff4adb 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -3623,6 +3623,70 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series": + """ + Find all first occurrences of patterns in the Series/Index. + + Parameters + ---------- + patterns : array-like, Sequence or Series + Patterns to search for in the given Series/Index. + + Returns + ------- + Series + A Series with a list of indices of each pattern's first occurrence. + If a pattern is not found, -1 is returned for that index. 
+ + Examples + -------- + >>> import cudf + >>> s = cudf.Series(["strings", "to", "search", "in"]) + >>> s + 0 strings + 1 to + 2 search + 3 in + dtype: object + >>> t = cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]) + >>> t + 0 a + 1 string + 2 g + 3 inn + 4 o + 5 r + 6 sea + dtype: object + >>> s.str.find_multiple(t) + 0 [-1, 0, 5, -1, -1, 2, -1] + 1 [-1, -1, -1, -1, 1, -1, -1] + 2 [2, -1, -1, -1, -1, 3, 0] + 3 [-1, -1, -1, -1, -1, -1, -1] + dtype: list + """ + if can_convert_to_column(patterns): + patterns_column = column.as_column(patterns) + else: + raise TypeError( + "patterns should be an array-like or a Series object, " + f"found {type(patterns)}" + ) + + if not isinstance(patterns_column, StringColumn): + raise TypeError( + "patterns can only be of 'string' dtype, " + f"got: {patterns_column.dtype}" + ) + + return cudf.Series( + libstrings.find_multiple(self._column, patterns_column), + index=self._parent.index + if isinstance(self._parent, cudf.Series) + else self._parent, + name=self._parent.name, + ) + def isempty(self) -> SeriesOrIndex: """ Check whether each string is an empty string. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5b101f74664..0628497fc29 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2051,6 +2051,56 @@ def second(self): """ return self._get_dt_field("second") + @property # type: ignore + @_cudf_nvtx_annotate + def microsecond(self): + """ + The microseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", + ... 
periods=3, freq="us")) + >>> datetime_index + DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.000001', + '2000-01-01 00:00:00.000002'], + dtype='datetime64[ns]') + >>> datetime_index.microsecond + Int32Index([0, 1, 2], dtype='int32') + """ # noqa: E501 + return as_index( + ( + self._values.get_dt_field("millisecond") + * cudf.Scalar(1000, dtype="int32") + ) + + self._values.get_dt_field("microsecond"), + name=self.name, + ) + + @property # type: ignore + @_cudf_nvtx_annotate + def nanosecond(self): + """ + The nanoseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", + ... periods=3, freq="ns")) + >>> datetime_index + DatetimeIndex([ '2000-01-01 00:00:00', + '2000-01-01 00:00:00.000000001', + '2000-01-01 00:00:00.000000002'], + dtype='datetime64[ns]') + >>> datetime_index.nanosecond + Int16Index([0, 1, 2], dtype='int16') + """ + return self._get_dt_field("nanosecond") + @property # type: ignore @_cudf_nvtx_annotate def weekday(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f11052096e3..7493202a3d1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1206,7 +1206,8 @@ def __repr__(self): and not is_decimal_dtype(preprocess.dtype) and not is_struct_dtype(preprocess.dtype) ) or isinstance( - preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn + preprocess._column, + cudf.core.column.timedelta.TimeDeltaColumn, ): output = repr( preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() @@ -3591,6 +3592,64 @@ def second(self): """ return self._get_dt_field("second") + @property # type: ignore + @_cudf_nvtx_annotate + def microsecond(self): + """ + The microseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", + ... 
periods=3, freq="us")) + >>> datetime_series + 0 2000-01-01 00:00:00.000000 + 1 2000-01-01 00:00:00.000001 + 2 2000-01-01 00:00:00.000002 + dtype: datetime64[ns] + >>> datetime_series.dt.microsecond + 0 0 + 1 1 + 2 2 + dtype: int32 + """ + return Series( + data=( + self.series._column.get_dt_field("millisecond") + * cudf.Scalar(1000, dtype="int32") + ) + + self.series._column.get_dt_field("microsecond"), + index=self.series._index, + name=self.series.name, + ) + + @property # type: ignore + @_cudf_nvtx_annotate + def nanosecond(self): + """ + The nanoseconds of the datetime. + + Examples + -------- + >>> import pandas as pd + >>> import cudf + >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", + ... periods=3, freq="ns")) + >>> datetime_series + 0 2000-01-01 00:00:00.000000000 + 1 2000-01-01 00:00:00.000000001 + 2 2000-01-01 00:00:00.000000002 + dtype: datetime64[ns] + >>> datetime_series.dt.nanosecond + 0 0 + 1 1 + 2 2 + dtype: int16 + """ + return self._get_dt_field("nanosecond") + @property # type: ignore @_cudf_nvtx_annotate def weekday(self): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d9e9a4dbba1..1fcfbe5fc91 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4286,26 +4286,26 @@ def test_series_values_property(data): pytest.param( {"A": [1, None, 3], "B": [1, 2, None]}, marks=pytest.mark.xfail( - reason="Nulls not supported by as_gpu_matrix" + reason="Nulls not supported by values accessor" ), ), pytest.param( {"A": [None, None, None], "B": [None, None, None]}, marks=pytest.mark.xfail( - reason="Nulls not supported by as_gpu_matrix" + reason="Nulls not supported by values accessor" ), ), {"A": [], "B": []}, pytest.param( {"A": [1, 2, 3], "B": ["a", "b", "c"]}, marks=pytest.mark.xfail( - reason="str or categorical not supported by as_gpu_matrix" + reason="str or categorical not supported by values accessor" ), ), pytest.param( {"A": 
pd.Categorical(["a", "b", "c"]), "B": ["d", "e", "f"]}, marks=pytest.mark.xfail( - reason="str or categorical not supported by as_gpu_matrix" + reason="str or categorical not supported by values accessor" ), ), ], diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 800a8aeeab5..bd3b3561701 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -28,7 +28,9 @@ def data1(): def data2(): - return pd.date_range("20010101", "20020215", freq="400h", name="times") + return pd.date_range( + "20010101", freq="243434324423423234N", name="times", periods=10 + ) def timeseries_us_data(): @@ -81,6 +83,8 @@ def numerical_data(): "hour", "minute", "second", + "microsecond", + "nanosecond", "weekday", "dayofweek", "dayofyear", @@ -172,7 +176,7 @@ def test_dt_ops(data): # libcudf doesn't respect timezones -@pytest.mark.parametrize("data", [data1()]) +@pytest.mark.parametrize("data", [data1(), data2()]) @pytest.mark.parametrize("field", fields) def test_dt_series(data, field): pd_data = pd.Series(data.copy()) @@ -182,7 +186,7 @@ def test_dt_series(data, field): assert_eq(base, test) -@pytest.mark.parametrize("data", [data1()]) +@pytest.mark.parametrize("data", [data1(), data2()]) @pytest.mark.parametrize("field", fields) def test_dt_index(data, field): pd_data = data.copy() diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 74d602c2cf1..2a43adf5a5c 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -3423,3 +3423,64 @@ def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected): sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep ) assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "patterns, expected", + [ + ( + lambda: ["a", "s", "g", "i", "o", "r"], + [ + [-1, 0, 5, 3, -1, 2], + [-1, -1, -1, -1, 1, -1], + [2, 0, -1, -1, -1, 3], + [-1, -1, -1, 0, -1, -1], + ], + ), + ( 
+ lambda: cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]), + [ + [-1, 0, 5, -1, -1, 2, -1], + [-1, -1, -1, -1, 1, -1, -1], + [2, -1, -1, -1, -1, 3, 0], + [-1, -1, -1, -1, -1, -1, -1], + ], + ), + ], +) +def test_str_find_multiple(patterns, expected): + s = cudf.Series(["strings", "to", "search", "in"]) + t = patterns() + + expected = cudf.Series(expected) + + # We convert to pandas because find_multiple returns ListDtype(int32) + # and expected is ListDtype(int64). + # Currently there is no easy way to type-cast these to match. + assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas()) + + s = cudf.Index(s) + t = cudf.Index(t) + + expected.index = s + + assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas()) + + +def test_str_find_multiple_error(): + s = cudf.Series(["strings", "to", "search", "in"]) + with pytest.raises( + TypeError, + match=re.escape( + "patterns should be an array-like or a Series object, found " + "" + ), + ): + s.str.find_multiple("a") + + t = cudf.Series([1, 2, 3]) + with pytest.raises( + TypeError, + match=re.escape("patterns can only be of 'string' dtype, got: int64"), + ): + s.str.find_multiple(t)