From fdcb90a01fe84be8dbd83aa41bc5678b47185e16 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 31 Aug 2021 10:41:24 -0400 Subject: [PATCH] Support additional format specifiers in from_timestamps (#9047) Reference #5991 This PR adds support for the following format specifiers in `cudf::strings::from_timestamps` ``` %a and %A -- weekday names (passed into the API) %b and %B -- month names (passed into the API) %u - ISO weekday (1-7) %w - weekday (0-6) %U - week of the year (Sunday based) %W - week of the year (Monday based) %V - ISO week of the year %G - Year based on ISO weeks ``` This adds a new parameter to the API for the caller to pass the string names for the weekdays and months. These are only required if the `%a, %b, %A, %B` specifiers are contained in the format string. The change to `from_timestamps` is mainly a rewrite to include logic for these specifiers. Some common code required corresponding changes to `to_timestamps` and `is_timestamp`, though the behavior of these functions has not changed in this PR. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9047 --- .../cudf/strings/convert/convert_datetime.hpp | 84 +- .../cudf/strings/detail/converters.hpp | 3 +- cpp/src/io/csv/writer_impl.cu | 7 +- cpp/src/strings/convert/convert_datetime.cu | 965 ++++++++++-------- cpp/tests/strings/datetime_tests.cpp | 140 +++ 5 files changed, 752 insertions(+), 447 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index 39bd6c639aa..4abca96e32a 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,9 @@ #include #include +#include +#include + namespace cudf { namespace strings { /** @@ -135,7 +138,20 @@ std::unique_ptr is_timestamp( * | \%z | Always outputs "+0000" | * | \%Z | Always outputs "UTC" | * | \%j | Day of the year: 001-366 | - * | \%p | Only 'AM' or 'PM' | + * | \%u | ISO weekday where Monday is 1 and Sunday is 7 | + * | \%w | Weekday where Sunday is 0 and Saturday is 6 | + * | \%U | Week of the year with Sunday as the first day: 00-53 | + * | \%W | Week of the year with Monday as the first day: 00-53 | + * | \%V | Week of the year per ISO-8601 format: 01-53 | + * | \%G | Year based on the ISO-8601 weeks: 0000-9999 | + * | \%p | AM/PM from `timestamp_names::am_str/pm_str` | + * | \%a | Weekday abbreviation from the `names` parameter | + * | \%A | Weekday from the `names` parameter | + * | \%b | Month name abbreviation from the `names` parameter | + * | \%B | Month name from the `names` parameter | + * + * Additional descriptions can be found here: + * https://en.cppreference.com/w/cpp/chrono/system_clock/formatter * * No checking is done for invalid formats or invalid timestamp values. * All timestamps values are formatted to UTC. @@ -143,25 +159,75 @@ std::unique_ptr is_timestamp( * Any null input entry will result in a corresponding null entry in the output column. * * The time units of the input column do not influence the number of digits written by - * the "%f" specifier. - * The "%f" supports a precision value to write out numeric digits for the subsecond value. - * Specify the precision with a single integer value (1-9) between the "%" and the "f" as follows: - * use "%3f" for milliseconds, "%6f" for microseconds and "%9f" for nanoseconds. - * If the precision is higher than the units, then zeroes are padded to the right of - * the subsecond value. 
- * If the precision is lower than the units, the subsecond value may be truncated. + * the "%f" specifier. The "%f" supports a precision value to write out numeric digits + * for the subsecond value. Specify the precision with a single integer value (1-9) + * between the "%" and the "f" as follows: use "%3f" for milliseconds, use "%6f" for + * microseconds and use "%9f" for nanoseconds. If the precision is higher than the + * units, then zeroes are padded to the right of the subsecond value. If the precision + * is lower than the units, the subsecond value may be truncated. + * + * If the "%a", "%A", "%b", "%B" specifiers are included in the format, the caller + * should provide the format names in the `names` strings column using the following + * as a guide: + * + * @code{.pseudo} + * ["AM", "PM", // specify the AM/PM strings + * "Sunday", "Monday", ..., "Saturday", // Weekday full names + * "Sun", "Mon", ..., "Sat", // Weekday abbreviated names + * "January", "February", ..., "December", // Month full names + * "Jan", "Feb", ..., "Dec"] // Month abbreviated names + * @endcode + * + * The result is undefined if the format names are not provided for these specifiers. + * + * These format names can be retrieved for specific locales using the `nl_langinfo` + * functions from C++ `clocale` (std) library or the Python `locale` library. 
+ * + * The following code is an example of retrieving these strings from the locale + * using c++ std functions: + * + * @code{.cpp} + * #include + * #include + * + * // note: install language pack on Ubuntu using 'apt-get install language-pack-de' + * { + * // set to a German language locale for date settings + * std::setlocale(LC_TIME, "de_DE.UTF-8"); + * + * std::vector names({nl_langinfo(AM_STR), nl_langinfo(PM_STR), + * nl_langinfo(DAY_1), nl_langinfo(DAY_2), nl_langinfo(DAY_3), nl_langinfo(DAY_4), + * nl_langinfo(DAY_5), nl_langinfo(DAY_6), nl_langinfo(DAY_7), + * nl_langinfo(ABDAY_1), nl_langinfo(ABDAY_2), nl_langinfo(ABDAY_3), nl_langinfo(ABDAY_4), + * nl_langinfo(ABDAY_5), nl_langinfo(ABDAY_6), nl_langinfo(ABDAY_7), + * nl_langinfo(MON_1), nl_langinfo(MON_2), nl_langinfo(MON_3), nl_langinfo(MON_4), + * nl_langinfo(MON_5), nl_langinfo(MON_6), nl_langinfo(MON_7), nl_langinfo(MON_8), + * nl_langinfo(MON_9), nl_langinfo(MON_10), nl_langinfo(MON_11), nl_langinfo(MON_12), + * nl_langinfo(ABMON_1), nl_langinfo(ABMON_2), nl_langinfo(ABMON_3), nl_langinfo(ABMON_4), + * nl_langinfo(ABMON_5), nl_langinfo(ABMON_6), nl_langinfo(ABMON_7), nl_langinfo(ABMON_8), + * nl_langinfo(ABMON_9), nl_langinfo(ABMON_10), nl_langinfo(ABMON_11), nl_langinfo(ABMON_12)}); + * + * std::setlocale(LC_TIME,""); // reset to default locale + * } + * @endcode * * @throw cudf::logic_error if `timestamps` column parameter is not a timestamp type. + * @throw cudf::logic_error if the `format` string is empty + * @throw cudf::logic_error if `names.size()` is an invalid size. Must be 0 or 40 strings. * * @param timestamps Timestamp values to convert. * @param format The string specifying output format. * Default format is "%Y-%m-%dT%H:%M:%SZ". + * @param names The string names to use for weekdays ("%a", "%A") and months ("%b", "%B") + * Default is an empty `strings_column_view`. * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return New strings column with formatted timestamps. */ std::unique_ptr from_timestamps( column_view const& timestamps, std::string const& format = "%Y-%m-%dT%H:%M:%SZ", + strings_column_view const& names = strings_column_view(column_view{ + data_type{type_id::STRING}, 0, nullptr}), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index d91979708e0..b44276fcc33 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -100,12 +100,13 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, /** * @copydoc from_timestamps(strings_column_view const&,std::string - * const&,rmm::mr::device_memory_resource*) + * const&,strings_column_view const&,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, + strings_column_view const& names, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 335634b7fa8..f50aae72418 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -231,7 +231,12 @@ struct column_to_strings_fn { format = "\"" + format + "\""; } - return cudf::strings::detail::from_timestamps(column, format, stream_, mr_); + return cudf::strings::detail::from_timestamps( + column, + format, + strings_column_view(column_view{data_type{type_id::STRING}, 0, nullptr}), + stream_, + mr_); } template diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d804ac66961..ce5eb015039 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -26,17 +27,19 @@ #include #include #include -#include +#include #include #include #include #include +#include #include #include #include +#include #include namespace cudf { @@ -45,38 +48,23 @@ namespace detail { namespace { /** - * @brief Units for timestamp conversion. - * These are defined since there are more than what cudf supports. 
+ * @brief Structure of date/time components */ -enum class timestamp_units { - years, ///< precision is years - months, ///< precision is months - days, ///< precision is days - hours, ///< precision is hours - minutes, ///< precision is minutes - seconds, ///< precision is seconds - ms, ///< precision is milliseconds - us, ///< precision is microseconds - ns ///< precision is nanoseconds -}; - -// used to index values in a timeparts array -enum timestamp_parse_component { - TP_YEAR = 0, - TP_MONTH = 1, - TP_DAY = 2, - TP_DAY_OF_YEAR = 3, - TP_HOUR = 4, - TP_MINUTE = 5, - TP_SECOND = 6, - TP_SUBSECOND = 7, - TP_TZ_MINUTES = 8, - TP_ARRAYSIZE = 9 +struct timestamp_components { + int16_t year; + int8_t month; + int8_t day; + int16_t day_of_year; + int8_t hour; + int8_t minute; + int8_t second; + int32_t subsecond; + int32_t tz_minutes; }; enum class format_char_type : int8_t { - literal, // literal char type passed through - specifier // timestamp format specifier + literal, ///< literal char type passed through + specifier ///< timestamp format specifier }; /** @@ -93,49 +81,49 @@ struct alignas(4) format_item { { return format_item{format_char_type::specifier, format_char, length}; } - static format_item new_delimiter(char literal) + static format_item new_literal(char literal) { return format_item{format_char_type::literal, literal, 1}; } }; /** - * @brief The format_compiler parses a timestamp format string into a vector of - * format_items. + * @brief The format-compiler parses a timestamp format string into a vector of + * `format_items`. * - * The vector of format_items are used when parsing a string into timestamp + * The vector of `format_items` is used when parsing a string into timestamp * components and when formatting a string from timestamp components. 
*/ +using specifier_map = std::map; + struct format_compiler { - std::string format; - std::string template_string; + std::string const format; rmm::device_uvector d_items; - std::map specifier_lengths = {{'Y', 4}, - {'y', 2}, - {'m', 2}, - {'d', 2}, - {'H', 2}, - {'I', 2}, - {'M', 2}, - {'S', 2}, - {'f', 6}, - {'z', 5}, - {'Z', 3}, - {'p', 2}, - {'j', 3}}; - - format_compiler(const char* fmt, rmm::cuda_stream_view stream) : format(fmt), d_items(0, stream) + // clang-format off + // The specifiers are documented here (not all are supported): + // https://en.cppreference.com/w/cpp/chrono/system_clock/formatter + specifier_map specifiers = { + {'Y', 4}, {'y', 2}, {'m', 2}, {'d', 2}, {'H', 2}, {'I', 2}, {'M', 2}, + {'S', 2}, {'f', 6}, {'z', 5}, {'Z', 3}, {'p', 2}, {'j', 3}}; + // clang-format on + + format_compiler(std::string fmt, + rmm::cuda_stream_view stream, + specifier_map extra_specifiers = {}) + : format(fmt), d_items(0, stream) { + specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); std::vector items; const char* str = format.c_str(); auto length = format.length(); while (length > 0) { char ch = *str++; length--; + + // first check for a literal character if (ch != '%') { - items.push_back(format_item::new_delimiter(ch)); - template_string.append(1, ch); + items.push_back(format_item::new_literal(ch)); continue; } CUDF_EXPECTS(length > 0, "Unfinished specifier in timestamp format"); @@ -144,45 +132,42 @@ struct format_compiler { length--; if (ch == '%') // escaped % char { - items.push_back(format_item::new_delimiter(ch)); - template_string.append(1, ch); + items.push_back(format_item::new_literal(ch)); continue; } if (ch >= '0' && ch <= '9') { CUDF_EXPECTS(*str == 'f', "precision not supported for specifier: " + std::string(1, *str)); - specifier_lengths[*str] = static_cast(ch - '0'); - ch = *str++; + specifiers[*str] = static_cast(ch - '0'); + ch = *str++; length--; } - CUDF_EXPECTS(specifier_lengths.find(ch) != specifier_lengths.end(), + + 
// check if the specifier found is supported + CUDF_EXPECTS(specifiers.find(ch) != specifiers.end(), "invalid format specifier: " + std::string(1, ch)); - int8_t spec_length = specifier_lengths[ch]; - items.push_back(format_item::new_specifier(ch, spec_length)); - template_string.append((size_t)spec_length, ch); + // create the format item for this specifier + items.push_back(format_item::new_specifier(ch, specifiers[ch])); } - // create program in device memory - d_items.resize(items.size(), stream); - CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyHostToDevice, - stream.value())); + + // copy format_items to device memory + d_items = cudf::detail::make_device_uvector_async(items, stream); } - format_item const* format_items() { return d_items.data(); } - size_type template_bytes() const { return static_cast(template_string.size()); } - size_type items_count() const { return static_cast(d_items.size()); } - int8_t subsecond_precision() const { return specifier_lengths.at('f'); } + device_span format_items() { return device_span(d_items); } + + int8_t subsecond_precision() const { return specifiers.at('f'); } }; -// this parses date/time characters into a timestamp integer -template // timestamp type +/** + * @brief This parses date/time characters into a timestamp integer + * + * @tparam T cudf::timestamp type + */ +template struct parse_datetime { column_device_view const d_strings; - format_item const* d_format_items; - size_type items_count; - timestamp_units units; + device_span const d_format_items; int8_t subsecond_precision; /** @@ -210,16 +195,17 @@ struct parse_datetime { return value; } - // Walk the format_items to read the datetime string. - // Returns 0 if all ok. 
- __device__ int parse_into_parts(string_view const& d_string, int32_t* timeparts) + // Walk the format_items to parse the string into date/time components + __device__ timestamp_components parse_into_parts(string_view const& d_string) { + timestamp_components timeparts = {1970, 1, 1, 0}; // init to epoch time + auto ptr = d_string.data(); auto length = d_string.size_bytes(); - for (size_t idx = 0; idx < items_count; ++idx) { - auto item = d_format_items[idx]; + for (auto item : d_format_items) { if (item.value != 'f') item.length = static_cast(std::min(static_cast(item.length), length)); + if (item.item_type == format_char_type::literal) { // static character we'll just skip; // consume item.length bytes from string @@ -230,93 +216,77 @@ struct parse_datetime { // special logic for each specifier switch (item.value) { - case 'Y': timeparts[TP_YEAR] = str2int(ptr, item.length); break; + case 'Y': timeparts.year = static_cast(str2int(ptr, item.length)); break; case 'y': { - auto const year = str2int(ptr, item.length); - timeparts[TP_YEAR] = year + (year < 69 ? 2000 : 1900); + auto const year = str2int(ptr, item.length); + timeparts.year = static_cast(year + (year < 69 ? 
2000 : 1900)); break; } - case 'm': timeparts[TP_MONTH] = str2int(ptr, item.length); break; - case 'd': timeparts[TP_DAY] = str2int(ptr, item.length); break; - case 'j': timeparts[TP_DAY_OF_YEAR] = str2int(ptr, item.length); break; + case 'm': timeparts.month = static_cast(str2int(ptr, item.length)); break; + case 'd': timeparts.day = static_cast(str2int(ptr, item.length)); break; + case 'j': timeparts.day_of_year = static_cast(str2int(ptr, item.length)); break; case 'H': - case 'I': timeparts[TP_HOUR] = str2int(ptr, item.length); break; - case 'M': timeparts[TP_MINUTE] = str2int(ptr, item.length); break; - case 'S': timeparts[TP_SECOND] = str2int(ptr, item.length); break; + case 'I': timeparts.hour = static_cast(str2int(ptr, item.length)); break; + case 'M': timeparts.minute = static_cast(str2int(ptr, item.length)); break; + case 'S': timeparts.second = static_cast(str2int(ptr, item.length)); break; case 'f': { int32_t const read_size = std::min(static_cast(item.length), static_cast(length)); - int64_t const fraction = str2int(ptr, read_size) * power_of_ten(item.length - read_size); - timeparts[TP_SUBSECOND] = static_cast(fraction); + int64_t const fraction = str2int(ptr, read_size) * power_of_ten(item.length - read_size); + timeparts.subsecond = static_cast(fraction); break; } case 'p': { string_view am_pm(ptr, 2); - auto hour = timeparts[TP_HOUR]; + auto hour = timeparts.hour; if ((am_pm.compare("AM", 2) == 0) || (am_pm.compare("am", 2) == 0)) { if (hour == 12) hour = 0; } else if (hour < 12) hour += 12; - timeparts[TP_HOUR] = hour; + timeparts.hour = hour; break; } case 'z': { - int sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC - int hh = str2int(ptr + 1, 2); - int mm = str2int(ptr + 3, 2); + auto const sign = *ptr == '-' ? 
1 : -1; // revert timezone back to UTC + auto const hh = str2int(ptr + 1, 2); + auto const mm = str2int(ptr + 3, 2); // ignoring the rest for now // item.length has how many chars we should read - timeparts[TP_TZ_MINUTES] = sign * ((hh * 60) + mm); + timeparts.tz_minutes = sign * ((hh * 60) + mm); break; } case 'Z': break; // skip - default: return 3; + default: break; } ptr += item.length; length -= item.length; } - return 0; + return timeparts; } - __device__ int64_t timestamp_from_parts(int32_t const* timeparts, timestamp_units units) + __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) { - auto year = timeparts[TP_YEAR]; - if (units == timestamp_units::years) return year - 1970; - auto month = timeparts[TP_MONTH]; - if (units == timestamp_units::months) - return ((year - 1970) * 12) + (month - 1); // months are 1-12, need to 0-base it here - auto day = timeparts[TP_DAY]; - auto ymd = // convenient chrono class handles the leap year calculations for us - cuda::std::chrono::year_month_day(cuda::std::chrono::year{year}, - cuda::std::chrono::month{static_cast(month)}, - cuda::std::chrono::day{static_cast(day)}); - int32_t days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); - if (units == timestamp_units::days) return days; - - auto tzadjust = timeparts[TP_TZ_MINUTES]; // in minutes - auto hour = timeparts[TP_HOUR]; - if (units == timestamp_units::hours) return (days * 24L) + hour + (tzadjust / 60); - - auto minute = timeparts[TP_MINUTE]; - if (units == timestamp_units::minutes) - return static_cast(days * 24L * 60L) + (hour * 60L) + minute + tzadjust; - - auto second = timeparts[TP_SECOND]; - int64_t timestamp = - (days * 24L * 3600L) + (hour * 3600L) + (minute * 60L) + second + (tzadjust * 60); - if (units == timestamp_units::seconds) return timestamp; - - int64_t subsecond = - timeparts[TP_SUBSECOND] * power_of_ten(9 - subsecond_precision); // normalize to nanoseconds - if (units == timestamp_units::ms) { - timestamp *= 
1000L; - subsecond = subsecond / 1000000L; - } else if (units == timestamp_units::us) { - timestamp *= 1000000L; - subsecond = subsecond / 1000L; - } else if (units == timestamp_units::ns) - timestamp *= 1000000000L; + auto const ymd = // convenient chrono class handles the leap year calculations for us + cuda::std::chrono::year_month_day( + cuda::std::chrono::year{timeparts.year}, + cuda::std::chrono::month{static_cast(timeparts.month)}, + cuda::std::chrono::day{static_cast(timeparts.day)}); + auto const days = cuda::std::chrono::sys_days(ymd).time_since_epoch().count(); + + if constexpr (std::is_same_v) { return days; } + + int64_t timestamp = (days * 24L * 3600L) + (timeparts.hour * 3600L) + (timeparts.minute * 60L) + + timeparts.second + (timeparts.tz_minutes * 60L); + + if constexpr (std::is_same_v) { return timestamp; } + + int64_t const subsecond = + (timeparts.subsecond * power_of_ten(9 - subsecond_precision)) / // normalize to nanoseconds + (1000000000L / T::period::type::den); // and rescale to T + + timestamp *= T::period::type::den; timestamp += subsecond; + return timestamp; } @@ -326,73 +296,34 @@ struct parse_datetime { if (d_strings.is_null(idx)) return epoch_time; string_view d_str = d_strings.element(idx); if (d_str.empty()) return epoch_time; - // - int32_t timeparts[TP_ARRAYSIZE] = {1970, 1, 1}; // month and day are 1-based - if (parse_into_parts(d_str, timeparts)) return epoch_time; // unexpected parse case - // - return T{T::duration(timestamp_from_parts(timeparts, units))}; - } -}; -// convert cudf type to timestamp units -struct dispatch_timestamp_to_units_fn { - template - timestamp_units operator()() - { - CUDF_FAIL("Invalid type for timestamp conversion."); + auto const timeparts = parse_into_parts(d_str); + + return T{T::duration(timestamp_from_parts(timeparts))}; } }; -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::days; -} -template <> -timestamp_units 
dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::seconds; -} -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::ms; -} -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::us; -} -template <> -timestamp_units dispatch_timestamp_to_units_fn::operator()() -{ - return timestamp_units::ns; -} - -// dispatch operator to map timestamp to native fixed-width-type +/** + * @brief Type-dispatch operator to convert timestamp strings to native fixed-width-type + */ struct dispatch_to_timestamps_fn { template ()>* = nullptr> void operator()(column_device_view const& d_strings, std::string const& format, - timestamp_units units, mutable_column_view& results_view, rmm::cuda_stream_view stream) const { - format_compiler compiler(format.c_str(), stream); - auto d_items = compiler.format_items(); - auto d_results = results_view.data(); - parse_datetime pfn{ - d_strings, d_items, compiler.items_count(), units, compiler.subsecond_precision()}; + format_compiler compiler(format, stream); + parse_datetime pfn{d_strings, compiler.format_items(), compiler.subsecond_precision()}; thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(results_view.size()), - d_results, + results_view.data(), pfn); } template ()>* = nullptr> void operator()(column_device_view const&, std::string const&, - timestamp_units, mutable_column_view&, rmm::cuda_stream_view) const { @@ -403,31 +334,31 @@ struct dispatch_to_timestamps_fn { } // namespace // -std::unique_ptr to_timestamps(strings_column_view const& strings, +std::unique_ptr to_timestamps(strings_column_view const& input, data_type timestamp_type, std::string const& format, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_timestamp_column(timestamp_type, 0); + if 
(input.is_empty()) + return make_empty_column(timestamp_type); // make_timestamp_column(timestamp_type, 0); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - timestamp_units units = cudf::type_dispatcher(timestamp_type, dispatch_timestamp_to_units_fn()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); - auto results = make_timestamp_column(timestamp_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + auto results = make_timestamp_column(timestamp_type, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); + auto results_view = results->mutable_view(); cudf::type_dispatcher( - timestamp_type, dispatch_to_timestamps_fn(), d_column, format, units, results_view, stream); - results->set_null_count(strings.null_count()); + timestamp_type, dispatch_to_timestamps_fn(), *d_strings, format, results_view, stream); + + results->set_null_count(input.null_count()); return results; } @@ -438,8 +369,7 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, */ struct check_datetime_format { column_device_view const d_strings; - format_item const* d_format_items; - size_type items_count; + device_span const d_format_items; /** * @brief Check the specified characters are between ['0','9']. @@ -508,16 +438,17 @@ struct check_datetime_format { * The checking here is a little more strict than the actual * parser used for conversion. 
*/ - __device__ bool check_string(string_view const& d_string, int32_t* dateparts) + __device__ thrust::optional check_string(string_view const& d_string) { + timestamp_components dateparts = {1970, 1, 1, 0}; // init to epoch time + auto ptr = d_string.data(); auto length = d_string.size_bytes(); - for (size_t idx = 0; idx < items_count; ++idx) { - auto item = d_format_items[idx]; + for (auto item : d_format_items) { // eliminate static character values first if (item.item_type == format_char_type::literal) { // check static character matches - if (*ptr != item.value) return false; + if (*ptr != item.value) return thrust::nullopt; ptr += item.length; length -= item.length; continue; @@ -532,30 +463,30 @@ struct check_datetime_format { switch (item.value) { case 'Y': { if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts[TP_YEAR] = value.value(); + result = true; + dateparts.year = static_cast(value.value()); } break; } case 'y': { if (auto value = str2int(ptr, item.length)) { - result = true; - auto const year = value.value(); - dateparts[TP_YEAR] = year + (year < 69 ? 2000 : 1900); + result = true; + auto const year = value.value(); + dateparts.year = static_cast(year + (year < 69 ? 
2000 : 1900)); } break; } case 'm': { if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts[TP_MONTH] = value.value(); + result = true; + dateparts.month = static_cast(value.value()); } break; } case 'd': { if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts[TP_DAY] = value.value(); + result = true; + dateparts.day = static_cast(value.value()); } break; } @@ -587,23 +518,26 @@ struct check_datetime_format { case 'Z': result = true; // skip default: break; } - if (!result) return false; + if (!result) return thrust::nullopt; ptr += item.length; length -= item.length; } - return true; + return dateparts; } __device__ bool operator()(size_type idx) { if (d_strings.is_null(idx)) return false; + string_view d_str = d_strings.element(idx); if (d_str.empty()) return false; - int32_t dateparts[] = {1970, 1, 1}; // year, month, day - if (!check_string(d_str, dateparts)) return false; - auto year = dateparts[TP_YEAR]; - auto month = static_cast(dateparts[TP_MONTH]); - auto day = static_cast(dateparts[TP_DAY]); + + auto const dateparts = check_string(d_str); + if (!dateparts.has_value()) return false; + + auto const year = dateparts.value().year; + auto const month = static_cast(dateparts.value().month); + auto const day = static_cast(dateparts.value().day); return cuda::std::chrono::year_month_day(cuda::std::chrono::year{year}, cuda::std::chrono::month{month}, cuda::std::chrono::day{day}) @@ -611,36 +545,34 @@ struct check_datetime_format { } }; -std::unique_ptr is_timestamp(strings_column_view const& strings, +std::unique_ptr is_timestamp(strings_column_view const& input, std::string const& format, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::BOOL8}); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - auto strings_column = 
column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto d_strings = column_device_view::create(input.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); - format_compiler compiler(format.c_str(), stream); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - check_datetime_format{d_strings, compiler.format_items(), compiler.items_count()}); + format_compiler compiler(format, stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_results, + check_datetime_format{*d_strings, compiler.format_items()}); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -648,141 +580,205 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, // external APIs -std::unique_ptr to_timestamps(strings_column_view const& strings, +std::unique_ptr to_timestamps(strings_column_view const& input, data_type timestamp_type, std::string const& format, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_timestamps(strings, timestamp_type, format, rmm::cuda_stream_default, mr); + return detail::to_timestamps(input, timestamp_type, format, rmm::cuda_stream_default, mr); } -std::unique_ptr is_timestamp(strings_column_view const& strings, +std::unique_ptr is_timestamp(strings_column_view const& input, std::string const& format, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_timestamp(strings, format, rmm::cuda_stream_default, mr); + return detail::is_timestamp(input, format, 
rmm::cuda_stream_default, mr); } namespace detail { namespace { -// converts a timestamp into date-time string + +constexpr size_type format_names_size = 40; // 2(am/pm) + 2x7(weekdays) + 2x12(months) +constexpr size_type offset_weekdays = 2; +constexpr size_type offset_months = 16; +constexpr size_type days_in_week = 7; +constexpr size_type months_in_year = 12; + +/** + * @brief Time components used by the date_time_formatter + */ +struct time_components { + int8_t hour; + int8_t minute; + int8_t second; + int32_t subsecond; +}; + +/** + * @brief Base class for the `from_timestamps_size_fn` and the `date_time_formatter` + * + * These contain some common utility functions used by both subclasses. + */ template -struct datetime_formatter { - const column_device_view d_timestamps; - const format_item* d_format_items; - size_type items_count; - timestamp_units units; - const int32_t* d_offsets; - char* d_chars; - - __device__ cudf::timestamp_D::duration convert_to_days(int64_t timestamp, timestamp_units units) +struct from_timestamp_base { + /** + * @brief Specialized modulo expression that handles negative values. 
+ * + * @code{.pseudo} + * Examples: + * modulo(1,60) -> 1 + * modulo(-1,60) -> 59 + * @endcode + */ + __device__ int32_t modulo_time(int64_t time, int64_t base) const { - using namespace cuda::std::chrono; - using minutes = duration; - using hours = duration; - switch (units) { - case timestamp_units::minutes: return floor(minutes(timestamp)); - case timestamp_units::seconds: return floor(cudf::timestamp_s::duration(timestamp)); - case timestamp_units::hours: return floor(hours(timestamp)); - case timestamp_units::ms: return floor(cudf::timestamp_ms::duration(timestamp)); - case timestamp_units::us: return floor(cudf::timestamp_us::duration(timestamp)); - case timestamp_units::ns: return floor(cudf::timestamp_ns::duration(timestamp)); - default: return cudf::timestamp_D::duration(timestamp); - } - } + return static_cast(((time % base) + base) % base); + }; - // divide timestamp integer into time components (year, month, day, etc) - // TODO call the cuda::std::chrono methods here instead when they are ready - __device__ void dissect_timestamp(int64_t timestamp, int32_t* timeparts) + /** + * @brief This function handles converting units by dividing and adjusting for negative values. 
+ * + * @code{.pseudo} + * Examples: + * scale(-61,60) -> -2 + * scale(-60,60) -> -1 + * scale(-59,60) -> -1 + * scale( 59,60) -> 0 + * scale( 60,60) -> 1 + * scale( 61,60) -> 1 + * @endcode + */ + __device__ int32_t scale_time(int64_t time, int64_t base) const + { + return static_cast((time - ((time < 0) * (base - 1L))) / base); + }; + + __device__ time_components get_time_components(int64_t tstamp) const { - if (units == timestamp_units::years) { - timeparts[TP_YEAR] = static_cast(timestamp) + 1970; - timeparts[TP_MONTH] = 1; - timeparts[TP_DAY] = 1; - return; + time_components result = {0}; + if constexpr (std::is_same_v) { return result; } + + // Note: Tried using: cuda::std::chrono::hh_mm_ss(T::duration(timestamp)); + // and retrieving the hour, minute, second, and subsecond values from it + // but it did not scale/modulo the components for negative timestamps + // correctly -- it simply did an abs(timestamp) as documented here: + // https://en.cppreference.com/w/cpp/chrono/hh_mm_ss/hh_mm_ss + + if constexpr (not std::is_same_v) { + int64_t constexpr base = T::period::type::den; // 1000=ms, 1000000=us, etc + auto const subsecond = modulo_time(tstamp, base); + tstamp = tstamp / base - ((tstamp < 0) and (subsecond != 0)); + result.subsecond = subsecond; } - // Specialized modulo expression that handles negative values. - // Examples: - // modulo(1,60) 1 - // modulo(-1,60) 59 - auto modulo_time = [](int64_t time, int64_t base) { - return static_cast(((time % base) + base) % base); - }; + result.hour = modulo_time(scale_time(tstamp, 3600), 24); + result.minute = modulo_time(scale_time(tstamp, 60), 60); + result.second = modulo_time(tstamp, 60); - // This function handles converting units by dividing and adjusting for negative values. 
- // Examples: - // scale(-61,60) -2 - // scale(-60,60) -1 - // scale(-59,60) -1 - // scale( 59,60) 0 - // scale( 60,60) 1 - // scale( 61,60) 1 - auto scale_time = [](int64_t time, int64_t base) { - return static_cast((time - ((time < 0) * (base - 1L))) / base); - }; + return result; + } +}; - if (units == timestamp_units::months) { - int32_t month = modulo_time(timestamp, 12); - int32_t year = scale_time(timestamp, 12) + 1970; - timeparts[TP_YEAR] = year; - timeparts[TP_MONTH] = month + 1; // months start at 1 and not 0 - timeparts[TP_DAY] = 1; - return; - } +template +struct from_timestamps_size_fn : public from_timestamp_base { + column_device_view const d_timestamps; + column_device_view const d_format_names; + device_span const d_format_items; + + from_timestamps_size_fn(column_device_view const& d_timestamps, + column_device_view const& d_format_names, + device_span const& d_format_items) + : d_timestamps(d_timestamps), d_format_names(d_format_names), d_format_items(d_format_items) + { + } - // first, convert to days so we can handle months, years, day of the year. - auto const days = convert_to_days(timestamp, units); - auto const ymd = cuda::std::chrono::year_month_day(cuda::std::chrono::sys_days(days)); - auto const year = static_cast(ymd.year()); - auto const month = static_cast(ymd.month()); - auto const day = static_cast(ymd.day()); + __device__ size_type operator()(size_type idx) const + { + if (d_timestamps.is_null(idx)) { return 0; } - int32_t const monthDayOffset[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; - timeparts[TP_DAY_OF_YEAR] = - day + monthDayOffset[month - 1] + (month > 2 and ymd.year().is_leap()); + // We only dissect the timestamp into components if needed + // by a specifier. And then we only do it once and reuse it. + // This can improve performance when not using uncommon specifiers. 
+ thrust::optional days; - timeparts[TP_YEAR] = year; - timeparts[TP_MONTH] = month; - timeparts[TP_DAY] = day; - if (units == timestamp_units::days) return; + auto days_from_timestamp = [&]() { + auto const tstamp = d_timestamps.element(idx).time_since_epoch().count(); + return cuda::std::chrono::sys_days(static_cast( + floor(T::duration(tstamp)))); + }; - // done with date, now work on time + size_type bytes = 0; // output size + for (auto item : d_format_items) { + if (item.item_type == format_char_type::literal) { + bytes += item.length; + continue; + } - if (units == timestamp_units::hours) { - timeparts[TP_HOUR] = modulo_time(timestamp, 24); - return; - } - if (units == timestamp_units::minutes) { - timeparts[TP_HOUR] = modulo_time(scale_time(timestamp, 60), 24); - timeparts[TP_MINUTE] = modulo_time(timestamp, 60); - return; - } - if (units == timestamp_units::seconds) { - timeparts[TP_HOUR] = modulo_time(scale_time(timestamp, 3600), 24); - timeparts[TP_MINUTE] = modulo_time(scale_time(timestamp, 60), 60); - timeparts[TP_SECOND] = modulo_time(timestamp, 60); - return; + // only specifiers resulting in strings require special logic + switch (item.value) { + case 'a': // weekday abbreviated + case 'A': { // weekday full name + if (!days.has_value()) { days = days_from_timestamp(); } + auto const day_of_week = + cuda::std::chrono::year_month_weekday(days.value()).weekday().c_encoding(); + auto const day_idx = + day_of_week + offset_weekdays + (item.value == 'a' ? days_in_week : 0); + if (day_idx < d_format_names.size()) + bytes += d_format_names.element(day_idx).size_bytes(); + break; + } + case 'b': // month abbreviated + case 'B': { // month full name + if (!days.has_value()) { days = days_from_timestamp(); } + auto const month = + static_cast(cuda::std::chrono::year_month_day(days.value()).month()); + auto const month_idx = + month - 1 + offset_months + (item.value == 'b' ? 
months_in_year : 0); + if (month_idx < d_format_names.size()) + bytes += d_format_names.element(month_idx).size_bytes(); + break; + } + case 'p': // AM/PM + { + auto times = get_time_components(d_timestamps.element(idx).time_since_epoch().count()); + bytes += d_format_names.size() > 1 + ? d_format_names.element(static_cast(times.hour >= 12)) + .size_bytes() + : 2; + break; + } + default: { + bytes += item.length; + break; + } + } } + return bytes; + } +}; - // common utility for setting time components from a subsecond unit value - auto subsecond_fn = [&](int64_t subsecond_base) { - auto subsecond = modulo_time(timestamp, subsecond_base); - timestamp = timestamp / subsecond_base - ((timestamp < 0) and (subsecond != 0)); - timeparts[TP_SUBSECOND] = subsecond; - timeparts[TP_HOUR] = modulo_time(scale_time(timestamp, 3600), 24); - timeparts[TP_MINUTE] = modulo_time(scale_time(timestamp, 60), 60); - timeparts[TP_SECOND] = modulo_time(timestamp, 60); - }; - - if (units == timestamp_units::ms) - subsecond_fn(1000); - else if (units == timestamp_units::us) - subsecond_fn(1000000); - else - subsecond_fn(1000000000); +// converts a timestamp into date-time formatted string +template +struct datetime_formatter : public from_timestamp_base { + column_device_view const d_timestamps; + column_device_view const d_format_names; + device_span const d_format_items; + int32_t const* d_offsets{}; + char* d_chars{}; + + datetime_formatter(column_device_view const& d_timestamps, + column_device_view const& d_format_names, + device_span const& d_format_items, + int32_t const* d_offsets, + char* d_chars) + : d_timestamps(d_timestamps), + d_format_names(d_format_names), + d_format_items(d_format_items), + d_offsets(d_offsets), + d_chars(d_chars) + { } // utility to create 0-padded integers (up to 9 chars) @@ -801,120 +797,234 @@ struct datetime_formatter { return str; } - __device__ char* format_from_parts(int32_t const* timeparts, char* ptr) + // from 
https://howardhinnant.github.io/date/date.html + __device__ thrust::pair get_iso_week_year( + cuda::std::chrono::year_month_day const& ymd) const + { + auto const days = cuda::std::chrono::sys_days(ymd); + auto year = ymd.year(); + + auto iso_week_start = [](cuda::std::chrono::year const y) { + // clang-format off + return cuda::std::chrono::sys_days{cuda::std::chrono::Thursday[1]/cuda::std::chrono::January/y} - + (cuda::std::chrono::Thursday - cuda::std::chrono::Monday); + // clang-format on + }; + + auto start = iso_week_start(year); + if (days < start) + start = iso_week_start(--year); + else { + auto const next_start = iso_week_start(year + cuda::std::chrono::years{1}); + if (days >= next_start) { + ++year; + start = next_start; + } + } + return thrust::make_pair( + (cuda::std::chrono::duration_cast(days - start) + + cuda::std::chrono::weeks{1}) // always [1-53] + .count(), + static_cast(year)); + } + + __device__ int8_t get_week_of_year(cuda::std::chrono::sys_days const days, + cuda::std::chrono::sys_days const start) const { - for (size_t idx = 0; idx < items_count; ++idx) { - auto item = d_format_items[idx]; + return days < start + ? 
0 + : (cuda::std::chrono::duration_cast(days - start) + + cuda::std::chrono::weeks{1}) + .count(); + } + + __device__ int32_t get_day_of_year(cuda::std::chrono::year_month_day const& ymd) + { + auto const month = static_cast(ymd.month()); + auto const day = static_cast(ymd.day()); + int32_t const monthDayOffset[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; + return static_cast(day + monthDayOffset[month - 1] + + (month > 2 and ymd.year().is_leap())); + } + + __device__ void operator()(size_type idx) + { + if (d_timestamps.is_null(idx)) return; + auto tstamp = d_timestamps.element(idx).time_since_epoch().count(); + + auto const days = cuda::std::chrono::sys_days(static_cast( + cuda::std::chrono::floor(T::duration(tstamp)))); + auto const ymd = cuda::std::chrono::year_month_day(days); + + auto timeparts = get_time_components(tstamp); + + // convert to characters using the format items + auto ptr = d_chars + d_offsets[idx]; + for (auto item : d_format_items) { if (item.item_type == format_char_type::literal) { *ptr++ = item.value; continue; } + + // Value to use for int2str call at the end of the switch-statement. + // This simplifies the case statements and prevents alot of extra inlining. 
+ int32_t copy_value = -1; // default set for non-int2str usage cases + // special logic for each specifier switch (item.value) { case 'Y': // 4-digit year - ptr = int2str(ptr, item.length, timeparts[TP_YEAR]); + copy_value = static_cast(ymd.year()); break; case 'y': // 2-digit year { - auto year = timeparts[TP_YEAR]; + auto year = static_cast(ymd.year()); // remove hundredths digits and above - ptr = int2str(ptr, item.length, year - ((year / 100) * 100)); + copy_value = year - ((year / 100) * 100); break; } case 'm': // month - ptr = int2str(ptr, item.length, timeparts[TP_MONTH]); + copy_value = static_cast(static_cast(ymd.month())); break; case 'd': // day of month - ptr = int2str(ptr, item.length, timeparts[TP_DAY]); + copy_value = static_cast(static_cast(ymd.day())); break; case 'j': // day of year - ptr = int2str(ptr, item.length, timeparts[TP_DAY_OF_YEAR]); + copy_value = get_day_of_year(ymd); break; case 'H': // 24-hour - ptr = int2str(ptr, item.length, timeparts[TP_HOUR]); + copy_value = timeparts.hour; break; case 'I': // 12-hour { // 0 = 12am; 12 = 12pm; 6 = 06am; 18 = 06pm - auto hour = timeparts[TP_HOUR]; - if (hour == 0) hour = 12; - if (hour > 12) hour -= 12; - ptr = int2str(ptr, item.length, hour); + copy_value = [h = timeparts.hour] { + if (h == 0) return 12; + return h > 12 ? 
h - 12 : h; + }(); break; } case 'M': // minute - ptr = int2str(ptr, item.length, timeparts[TP_MINUTE]); + copy_value = timeparts.minute; break; case 'S': // second - ptr = int2str(ptr, item.length, timeparts[TP_SECOND]); + copy_value = timeparts.second; break; case 'f': // sub-second { char subsecond_digits[] = "000000000"; // 9 max digits - const int digits = [units = units] { - if (units == timestamp_units::ms) return 3; - if (units == timestamp_units::us) return 6; - if (units == timestamp_units::ns) return 9; + const int digits = [] { + if constexpr (std::is_same_v) return 3; + if constexpr (std::is_same_v) return 6; + if constexpr (std::is_same_v) return 9; return 0; }(); - int2str(subsecond_digits, digits, timeparts[TP_SUBSECOND]); + int2str(subsecond_digits, digits, timeparts.subsecond); ptr = copy_and_increment(ptr, subsecond_digits, item.length); break; } case 'p': // am or pm + { // 0 = 12am, 12 = 12pm - if (timeparts[TP_HOUR] < 12) - memcpy(ptr, "AM", 2); - else - memcpy(ptr, "PM", 2); - ptr += 2; + auto const am_pm = [&] { + if (d_format_names.size() > 1) + return d_format_names.element( + static_cast(timeparts.hour >= 12)); + return string_view(timeparts.hour >= 12 ? "PM" : "AM", 2); + }(); + ptr = copy_string(ptr, am_pm); break; - case 'z': // timezone - memcpy(ptr, "+0000", 5); // always UTC - ptr += 5; + } + case 'z': // timezone -- always UTC + ptr = copy_and_increment(ptr, "+0000", 5); break; - case 'Z': - memcpy(ptr, "UTC", 3); - ptr += 3; + case 'Z': // timezone string -- always UTC + ptr = copy_and_increment(ptr, "UTC", 3); break; - default: // ignore everything else + case 'u': // day of week ISO + case 'w': { // day of week non-ISO + auto const day_of_week = static_cast( + cuda::std::chrono::year_month_weekday(days).weekday().c_encoding()); + copy_value = day_of_week == 0 && item.value == 'u' ? 
7 : day_of_week; break; + } + // clang-format off + case 'U': { // week of year: first week includes the first Sunday of the year + copy_value = get_week_of_year(days, cuda::std::chrono::sys_days{ + cuda::std::chrono::Sunday[1]/cuda::std::chrono::January/ymd.year()}); + break; + } + case 'W': { // week of year: first week includes the first Monday of the year + copy_value = get_week_of_year(days, cuda::std::chrono::sys_days{ + cuda::std::chrono::Monday[1]/cuda::std::chrono::January/ymd.year()}); + break; + } + // clang-format on + case 'V': // ISO week number + case 'G': { // ISO year number + auto const [week, year] = get_iso_week_year(ymd); + copy_value = item.value == 'G' ? year : week; + break; + } + case 'a': // abbreviated day of the week + case 'A': { // day of the week + auto const day_of_week = + cuda::std::chrono::year_month_weekday(days).weekday().c_encoding(); + auto const day_idx = + day_of_week + offset_weekdays + (item.value == 'a' ? days_in_week : 0); + if (d_format_names.size()) + ptr = copy_string(ptr, d_format_names.element(day_idx)); + break; + } + case 'b': // abbreviated month of the year + case 'B': { // month of the year + auto const month = static_cast(ymd.month()); + auto const month_idx = + month - 1 + offset_months + (item.value == 'b' ? 
months_in_year : 0); + if (d_format_names.size()) + ptr = copy_string(ptr, d_format_names.element(month_idx)); + break; + } + default: break; } + if (copy_value >= 0) ptr = int2str(ptr, item.length, copy_value); } - return ptr; - } - - __device__ void operator()(size_type idx) - { - if (d_timestamps.is_null(idx)) return; - auto timestamp = d_timestamps.element(idx); - int32_t timeparts[TP_ARRAYSIZE] = {0}; - dissect_timestamp(timestamp.time_since_epoch().count(), timeparts); - // convert to characters - format_from_parts(timeparts, d_chars + d_offsets[idx]); } }; // +using strings_children = std::pair, std::unique_ptr>; struct dispatch_from_timestamps_fn { template ()>* = nullptr> - void operator()(column_device_view const& d_timestamps, - format_item const* d_format_items, - size_type items_count, - timestamp_units units, - const int32_t* d_offsets, - char* d_chars, - rmm::cuda_stream_view stream) const + strings_children operator()(column_device_view const& d_timestamps, + column_device_view const& d_format_names, + device_span d_format_items, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - datetime_formatter pfn{d_timestamps, d_format_items, items_count, units, d_offsets, d_chars}; + size_type const strings_count = d_timestamps.size(); + // build offsets column + auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( + 0, from_timestamps_size_fn{d_timestamps, d_format_names, d_format_items}); + auto offsets_column = make_offsets_child_column( + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); + auto d_offsets = offsets_column->mutable_view().template data(); + + // build chars column + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + + datetime_formatter pfn{d_timestamps, d_format_names, d_format_items, 
d_offsets, d_chars}; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), d_timestamps.size(), pfn); + return std::make_pair(std::move(offsets_column), std::move(chars_column)); } template - std::enable_if_t(), void> operator()(Args&&...) const + std::enable_if_t(), strings_children> operator()(Args&&...) const { CUDF_FAIL("Only timestamps type are expected"); } @@ -925,59 +1035,41 @@ struct dispatch_from_timestamps_fn { // std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, + strings_column_view const& names, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = timestamps.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (timestamps.is_empty()) return make_empty_column(data_type{type_id::STRING}); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - timestamp_units units = - cudf::type_dispatcher(timestamps.type(), dispatch_timestamp_to_units_fn()); - - format_compiler compiler(format.c_str(), stream); - auto d_format_items = compiler.format_items(); - - auto column = column_device_view::create(timestamps, stream); - auto d_column = *column; - - // copy null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(timestamps, stream, mr); - // Each string will be the same number of bytes which can be determined - // directly from the format string. - auto d_str_bytes = compiler.template_bytes(); // size in bytes of each string - // build offsets column - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, [d_column, d_str_bytes] __device__(size_type idx) { - return d_column.is_null(idx) ? 
0 : d_str_bytes; - }); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); - auto d_new_offsets = offsets_view.template data(); - - // build chars column - auto const bytes = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - // fill in chars column with timestamps + CUDF_EXPECTS(names.is_empty() || names.size() == format_names_size, + "Invalid size for format names."); + + auto const d_names = column_device_view::create(names.parent(), stream); + + // This API supports a few more specifiers than to_timestamps. + // clang-format off + format_compiler compiler(format, stream, + specifier_map{{'w', 1}, {'W', 2}, {'u', 1}, {'U', 2}, {'V', 2}, {'G', 4}, + {'a', 3}, {'A', 3}, {'b', 3}, {'B', 3}}); + // clang-format on + auto const d_format_items = compiler.format_items(); + auto const d_timestamps = column_device_view::create(timestamps, stream); + // dispatcher is called to handle the different timestamp types - cudf::type_dispatcher(timestamps.type(), - dispatch_from_timestamps_fn(), - d_column, - d_format_items, - compiler.items_count(), - units, - d_new_offsets, - d_chars, - stream); - - return make_strings_column(strings_count, + auto [offsets_column, chars_column] = cudf::type_dispatcher(timestamps.type(), + dispatch_from_timestamps_fn(), + *d_timestamps, + *d_names, + d_format_items, + stream, + mr); + + return make_strings_column(timestamps.size(), std::move(offsets_column), std::move(chars_column), timestamps.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(timestamps, stream, mr), stream, mr); } @@ -988,10 +1080,11 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& 
format, + strings_column_view const& names, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_timestamps(timestamps, format, rmm::cuda_stream_default, mr); + return detail::from_timestamps(timestamps, format, names, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index a0f1eed9935..1a814ea707e 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -288,6 +288,145 @@ TEST_F(StringsDatetimeTest, FromTimestampDayOfYear) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +// Format names used for some specifiers in from_timestamps +// clang-format off +cudf::test::strings_column_wrapper format_names({"AM", "PM", + "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", + "January", "February", "March", "April", "May", "June", "July", + "August", "September", "October", "November", "December", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}); +// clang-format on + +TEST_F(StringsDatetimeTest, FromTimestampDayOfWeekOfYear) +{ + cudf::test::fixed_width_column_wrapper timestamps{ + 1645059720L, // 2022-02-17 + 1647167880L, // 2022-03-13 + 1649276040L, // 2022-04-06 + 1588734621L, // 2020-05-06 + 1560948892L, // 2019-06-19 + -265880250L, // 1961-07-29 + 1628194442L, // 2021-08-05 + 1632410760L, // 2021-09-23 + 1633464842L, // 2021-10-05 + 1636100042L, // 2021-11-05 + // These are a sequence of dates which are particular to the ISO week and + // year numbers which shift through Monday and Thursday and nicely includes + // a leap year (1980). 
https://en.wikipedia.org/wiki/ISO_week_date + 220924800L, // 1977-01-01 + 221011200L, // 1977-01-02 + 252374400L, // 1977-12-31 + 252460800L, // 1978-01-01 + 252547200L, // 1978-01-02 + 283910400L, // 1978-12-31 + 283996800L, // 1979-01-01 + 315360000L, // 1979-12-30 + 315446400L, // 1979-12-31 + 315532800L, // 1980-01-01 + 346809600L, // 1980-12-28 + 346896000L, // 1980-12-29 + 346982400L, // 1980-12-30 + 347068800L, // 1980-12-31 + 347155200L, // 1981-01-01 + 378604800L, // 1981-12-31 + 378691200L, // 1982-01-01 + 378777600L, // 1982-01-02 + 378864000L // 1982-01-03 + }; + + cudf::test::strings_column_wrapper expected( + {"[Thu 17, Feb 2022 4 07 4 07 2022 07]", "[Sun 13, Mar 2022 0 10 7 11 2022 10]", + "[Wed 06, Apr 2022 3 14 3 14 2022 14]", "[Wed 06, May 2020 3 18 3 18 2020 19]", + "[Wed 19, Jun 2019 3 24 3 24 2019 25]", "[Sat 29, Jul 1961 6 30 6 30 1961 30]", + "[Thu 05, Aug 2021 4 31 4 31 2021 31]", "[Thu 23, Sep 2021 4 38 4 38 2021 38]", + "[Tue 05, Oct 2021 2 40 2 40 2021 40]", "[Fri 05, Nov 2021 5 44 5 44 2021 44]", + "[Sat 01, Jan 1977 6 00 6 00 1976 53]", "[Sun 02, Jan 1977 0 00 7 01 1976 53]", + "[Sat 31, Dec 1977 6 52 6 52 1977 52]", "[Sun 01, Jan 1978 0 00 7 01 1977 52]", + "[Mon 02, Jan 1978 1 01 1 01 1978 01]", "[Sun 31, Dec 1978 0 52 7 53 1978 52]", + "[Mon 01, Jan 1979 1 01 1 00 1979 01]", "[Sun 30, Dec 1979 0 52 7 52 1979 52]", + "[Mon 31, Dec 1979 1 53 1 52 1980 01]", "[Tue 01, Jan 1980 2 00 2 00 1980 01]", + "[Sun 28, Dec 1980 0 51 7 52 1980 52]", "[Mon 29, Dec 1980 1 52 1 52 1981 01]", + "[Tue 30, Dec 1980 2 52 2 52 1981 01]", "[Wed 31, Dec 1980 3 52 3 52 1981 01]", + "[Thu 01, Jan 1981 4 00 4 00 1981 01]", "[Thu 31, Dec 1981 4 52 4 52 1981 53]", + "[Fri 01, Jan 1982 5 00 5 00 1981 53]", "[Sat 02, Jan 1982 6 00 6 00 1981 53]", + "[Sun 03, Jan 1982 0 00 7 01 1981 53]"}); + + auto results = cudf::strings::from_timestamps( + timestamps, "[%a %d, %b %Y %w %W %u %U %G %V]", cudf::strings_column_view(format_names)); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsDatetimeTest, FromTimestampWeekdayMonthYear) +{ + cudf::test::fixed_width_column_wrapper timestamps{ + 1642951560L, // 2022-01-23 15:26:00 Sunday + 1645059720L, // 2022-02-17 01:02:00 Thursday + 1647167880L, // 2022-03-13 10:38:00 Sunday + 1649276040L, // 2022-04-06 20:14:00 Wednesday + 1588734621L, // 2020-05-06 03:10:21 Wednesday + 1560948892L, // 2019-06-19 12:54:52 Wednesday + -265880250L, // 1961-07-29 16:22:30 Saturday + 1628194442L, // 2021-08-05 20:14:02 Thursday + 1632410760L, // 2021-09-23 15:26:00 Thursday + 1633464842L, // 2021-10-05 20:14:02 Tuesday + 1636100042L, // 2021-11-05 08:14:02 Friday + 1638757202L // 2021-12-06 02:20:00 Monday + }; + + cudf::test::strings_column_wrapper expected({"[Sunday January 23, 2022: 03 PM]", + "[Thursday February 17, 2022: 01 AM]", + "[Sunday March 13, 2022: 10 AM]", + "[Wednesday April 06, 2022: 08 PM]", + "[Wednesday May 06, 2020: 03 AM]", + "[Wednesday June 19, 2019: 12 PM]", + "[Saturday July 29, 1961: 04 PM]", + "[Thursday August 05, 2021: 08 PM]", + "[Thursday September 23, 2021: 03 PM]", + "[Tuesday October 05, 2021: 08 PM]", + "[Friday November 05, 2021: 08 AM]", + "[Monday December 06, 2021: 02 AM]"}); + + auto results = cudf::strings::from_timestamps( + timestamps, "[%A %B %d, %Y: %I %p]", cudf::strings_column_view(format_names)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsDatetimeTest, FromTimestampAllSpecifiers) +{ + cudf::test::fixed_width_column_wrapper input{ + 1645059720000000001L, + 1647167880000001000L, + 1649276040001000000L, + 1588734621123456789L, + 1560948892987654321L, + -265880250010203040L, + 1628194442090807060L, + 1632410760500400300L, + 1633464842000000000L, + 1636100042999999999L}; + + auto results = cudf::strings::from_timestamps( + input, + "[%d/%m/%y/%Y %H:%I:%M:%S.%f %z:%Z %j %u %U %W %V %G %p %a %A %b %B]", + cudf::strings_column_view(format_names)); + + // clang-format off + 
cudf::test::strings_column_wrapper expected({ + "[17/02/22/2022 01:01:02:00.000000 +0000:UTC 048 4 07 07 07 2022 AM Thu Thursday Feb February]", + "[13/03/22/2022 10:10:38:00.000001 +0000:UTC 072 7 11 10 10 2022 AM Sun Sunday Mar March]", + "[06/04/22/2022 20:08:14:00.001000 +0000:UTC 096 3 14 14 14 2022 PM Wed Wednesday Apr April]", + "[06/05/20/2020 03:03:10:21.123456 +0000:UTC 127 3 18 18 19 2020 AM Wed Wednesday May May]", + "[19/06/19/2019 12:12:54:52.987654 +0000:UTC 170 3 24 24 25 2019 PM Wed Wednesday Jun June]", + "[29/07/61/1961 16:04:22:29.989796 +0000:UTC 210 6 30 30 30 1961 PM Sat Saturday Jul July]", + "[05/08/21/2021 20:08:14:02.090807 +0000:UTC 217 4 31 31 31 2021 PM Thu Thursday Aug August]", + "[23/09/21/2021 15:03:26:00.500400 +0000:UTC 266 4 38 38 38 2021 PM Thu Thursday Sep September]", + "[05/10/21/2021 20:08:14:02.000000 +0000:UTC 278 2 40 40 40 2021 PM Tue Tuesday Oct October]", + "[05/11/21/2021 08:08:14:02.999999 +0000:UTC 309 5 44 44 44 2021 AM Fri Friday Nov November]"}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsDatetimeTest, ZeroSizeStringsColumn) { cudf::column_view zero_size_column( @@ -324,6 +463,7 @@ TEST_F(StringsDatetimeTest, Errors) cudf::test::fixed_width_column_wrapper timestamps{ 1530705600}; EXPECT_THROW(cudf::strings::from_timestamps(timestamps, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%A %B", view), cudf::logic_error); } TEST_F(StringsDatetimeTest, ToTimestampSingleSpecifier)