diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 4108e23d4ab..c0a20e1e47e 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -159,6 +159,25 @@ struct format_compiler { int8_t subsecond_precision() const { return specifiers.at('f'); } }; +/** + * @brief Specialized function to return the integer value reading up to the specified + * bytes or until an invalid character is encountered. + * + * @param str Beginning of characters to read. + * @param bytes Number of bytes in str to read. + * @return Integer value of valid characters read and how many bytes were not read. + */ +__device__ thrust::pair parse_int(char const* str, size_type bytes) +{ + int32_t value = 0; + while (bytes-- > 0) { + char chr = *str++; + if (chr < '0' || chr > '9') break; + value = (value * 10) + static_cast(chr - '0'); + } + return thrust::make_pair(value, bytes + 1); +} + /** * @brief This parses date/time characters into a timestamp integer * @@ -168,35 +187,22 @@ template struct parse_datetime { column_device_view const d_strings; device_span const d_format_items; - int8_t subsecond_precision; + int8_t const subsecond_precision; /** * @brief Return power of ten value given an exponent. * * @return `1x10^exponent` for `0 <= exponent <= 9` */ - __device__ constexpr int64_t power_of_ten(int32_t exponent) + __device__ constexpr int64_t power_of_ten(int32_t const exponent) const { constexpr int64_t powers_of_ten[] = { 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; return powers_of_ten[exponent]; } - // - __device__ int32_t str2int(const char* str, size_type bytes) - { - const char* ptr = str; - int32_t value = 0; - for (size_type idx = 0; idx < bytes; ++idx) { - char chr = *ptr++; - if (chr < '0' || chr > '9') break; - value = (value * 10) + static_cast(chr - '0'); - } - return value; - } - // Walk the format_items to parse the string into date/time components - __device__ timestamp_components parse_into_parts(string_view const& d_string) + __device__ timestamp_components parse_into_parts(string_view const& d_string) const { timestamp_components timeparts = {1970, 1, 1, 0}; // init to epoch time @@ -208,32 +214,71 @@ struct parse_datetime { if (item.item_type == format_char_type::literal) { // static character we'll just skip; - // consume item.length bytes from string + // consume item.length bytes from the input string ptr += item.length; length -= item.length; continue; } + size_type bytes_read = item.length; // number of bytes processed // special logic for each specifier switch (item.value) { - case 'Y': timeparts.year = static_cast(str2int(ptr, item.length)); break; + case 'Y': { + auto const [year, left] = parse_int(ptr, item.length); + timeparts.year = static_cast(year); + bytes_read -= left; + break; + } case 'y': { - auto const year = str2int(ptr, item.length); - timeparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); + auto const [year, left] = parse_int(ptr, item.length); + timeparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); + bytes_read -= left; + break; + } + case 'm': { + auto const [month, left] = parse_int(ptr, item.length); + timeparts.month = static_cast(month); + bytes_read -= left; + break; + } + case 'd': { + auto const [day, left] = parse_int(ptr, item.length); + timeparts.day = static_cast(day); + bytes_read -= left; + break; + } + case 'j': { + auto const [day, left] = parse_int(ptr, item.length); + timeparts.day_of_year = static_cast(day); + bytes_read -= left; break; } - case 'm': timeparts.month = static_cast(str2int(ptr, item.length)); break; - case 'd': timeparts.day = static_cast(str2int(ptr, item.length)); break; - case 'j': timeparts.day_of_year = static_cast(str2int(ptr, item.length)); break; case 'H': - case 'I': timeparts.hour = static_cast(str2int(ptr, item.length)); break; - case 'M': timeparts.minute = static_cast(str2int(ptr, item.length)); break; - case 'S': timeparts.second = static_cast(str2int(ptr, item.length)); break; + case 'I': { + auto const [hour, left] = parse_int(ptr, item.length); + timeparts.hour = static_cast(hour); + bytes_read -= left; + break; + } + case 'M': { + auto const [minute, left] = parse_int(ptr, item.length); + timeparts.minute = static_cast(minute); + bytes_read -= left; + break; + } + case 'S': { + auto const [second, left] = parse_int(ptr, item.length); + timeparts.second = static_cast(second); + bytes_read -= left; + break; + } case 'f': { int32_t const read_size = std::min(static_cast(item.length), static_cast(length)); - int64_t const fraction = str2int(ptr, read_size) * power_of_ten(item.length - read_size); - timeparts.subsecond = static_cast(fraction); + auto const [fraction, left] = parse_int(ptr, read_size); + timeparts.subsecond = + static_cast(fraction * power_of_ten(item.length - read_size - left)); + bytes_read = read_size - left; break; } case 'p': { @@ -247,24 +292,25 @@ struct parse_datetime { break; } case 'z': { - auto const sign = *ptr == '-' ? 1 : -1; // revert timezone back to UTC - auto const hh = str2int(ptr + 1, 2); - auto const mm = str2int(ptr + 3, 2); - // ignoring the rest for now - // item.length has how many chars we should read + // 'z' format is +hh:mm -- single sign char and 2 chars each for hour and minute + auto const sign = *ptr == '-' ? 1 : -1; + auto const [hh, lh] = parse_int(ptr + 1, 2); + auto const [mm, lm] = parse_int(ptr + 3, 2); + // revert timezone back to UTC timeparts.tz_minutes = sign * ((hh * 60) + mm); + bytes_read -= lh + lm; break; } case 'Z': break; // skip default: break; } - ptr += item.length; - length -= item.length; + ptr += bytes_read; + length -= bytes_read; } return timeparts; } - __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) + __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) const { auto const ymd = // convenient chrono class handles the leap year calculations for us cuda::std::chrono::year_month_day( @@ -290,7 +336,7 @@ struct parse_datetime { return timestamp; } - __device__ T operator()(size_type idx) + __device__ T operator()(size_type idx) const { T epoch_time{typename T::duration{0}}; if (d_strings.is_null(idx)) return epoch_time; @@ -385,29 +431,6 @@ struct check_datetime_format { }); } - /** - * @brief Specialized function to return the value and check for non-decimal characters. - * - * If non-decimal characters are found within `str` and `str + bytes` then - * the returned result is `thrust::nullopt` (_does not contain a value_). - * Otherwise, the parsed integer result is returned. - * - * @param str Beginning of characters to read/check. - * @param bytes Number of bytes in str to read/check. - * @return Integer value if characters are valid. - */ - __device__ thrust::optional str2int(const char* str, size_type bytes) - { - const char* ptr = str; - int32_t value = 0; - for (size_type idx = 0; idx < bytes; ++idx) { - char chr = *ptr++; - if (chr < '0' || chr > '9') return thrust::nullopt; - value = (value * 10) + static_cast(chr - '0'); - } - return value; - } - /** * @brief Check the specified characters are between ['0','9'] * and the resulting integer is within [`min_value`, `max_value`]. @@ -416,18 +439,23 @@ struct check_datetime_format { * @param bytes Number of bytes to check. * @param min_value Inclusive minimum value * @param max_value Inclusive maximum value - * @return true if parsed value is between `min_value` and `max_value`. + * @return If value is valid and number of bytes not successfully processed */ - __device__ bool check_value(const char* str, size_type bytes, int min_value, int max_value) + __device__ thrust::pair check_value(char const* str, + size_type const bytes, + int const min_value, + int const max_value) { - const char* ptr = str; + if (*str < '0' || *str > '9') { return thrust::make_pair(false, bytes); } int32_t value = 0; - for (size_type idx = 0; idx < bytes; ++idx) { - char chr = *ptr++; - if (chr < '0' || chr > '9') return false; + size_type count = bytes; + while (count-- > 0) { + char chr = *str++; + if (chr < '0' || chr > '9') break; value = (value * 10) + static_cast(chr - '0'); } - return value >= min_value && value <= max_value; + return (value >= min_value && value <= max_value) ? thrust::make_pair(true, count + 1) + : thrust::make_pair(false, bytes); } /** @@ -459,44 +487,72 @@ struct check_datetime_format { // special logic for each specifier // reference: https://man7.org/linux/man-pages/man3/strptime.3.html - bool result = false; + bool result = false; + size_type bytes_read = item.length; switch (item.value) { case 'Y': { - if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts.year = static_cast(value.value()); - } + auto const [year, left] = parse_int(ptr, item.length); + result = (left < item.length); + dateparts.year = static_cast(year); + bytes_read -= left; break; } case 'y': { - if (auto value = str2int(ptr, item.length)) { - result = true; - auto const year = value.value(); - dateparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); - } + auto const [year, left] = parse_int(ptr, item.length); + result = (left < item.length); + dateparts.year = static_cast(year + (year < 69 ? 2000 : 1900)); + bytes_read -= left; break; } case 'm': { - if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts.month = static_cast(value.value()); - } + auto const [month, left] = parse_int(ptr, item.length); + result = (left < item.length); + dateparts.month = static_cast(month); + bytes_read -= left; break; } case 'd': { - if (auto value = str2int(ptr, item.length)) { - result = true; - dateparts.day = static_cast(value.value()); - } + auto const [day, left] = parse_int(ptr, item.length); + result = (left < item.length); + dateparts.day = static_cast(day); // value.value() + bytes_read -= left; + break; + } + case 'j': { + auto const cv = check_value(ptr, item.length, 1, 366); + result = cv.first; + bytes_read -= cv.second; + break; + } + case 'H': { + auto const cv = check_value(ptr, item.length, 0, 23); + result = cv.first; + bytes_read -= cv.second; + break; + } + case 'I': { + auto const cv = check_value(ptr, item.length, 1, 12); + result = cv.first; + bytes_read -= cv.second; + break; + } + case 'M': { + auto const cv = check_value(ptr, item.length, 0, 59); + result = cv.first; + bytes_read -= cv.second; + break; + } + case 'S': { + auto const cv = check_value(ptr, item.length, 0, 60); + result = cv.first; + bytes_read -= cv.second; break; } - case 'j': result = check_value(ptr, item.length, 1, 366); break; - case 'H': result = check_value(ptr, item.length, 0, 23); break; - case 'I': result = check_value(ptr, item.length, 1, 12); break; - case 'M': result = check_value(ptr, item.length, 0, 59); break; - case 'S': result = check_value(ptr, item.length, 0, 60); break; case 'f': { - result = check_digits(ptr, std::min(static_cast(item.length), length)); + int32_t const read_size = + std::min(static_cast(item.length), static_cast(length)); + result = check_digits(ptr, read_size); + bytes_read = read_size; break; } case 'p': { @@ -509,9 +565,10 @@ struct check_datetime_format { } case 'z': { // timezone offset if (item.length == 5) { - result = (*ptr == '-' || *ptr == '+') && // sign - check_value(ptr + 1, 2, 0, 23) && // hour - check_value(ptr + 3, 2, 0, 59); // minute + auto const cvh = check_value(ptr + 1, 2, 0, 23); + auto const cvm = check_value(ptr + 3, 2, 0, 59); + result = (*ptr == '-' || *ptr == '+') && cvh.first && cvm.first; + bytes_read -= cvh.second + cvm.second; } break; } @@ -519,8 +576,8 @@ struct check_datetime_format { default: break; } if (!result) return thrust::nullopt; - ptr += item.length; - length -= item.length; + ptr += bytes_read; + length -= bytes_read; } return dateparts; } @@ -867,7 +924,7 @@ struct datetime_formatter : public from_timestamp_base { } // Value to use for int2str call at the end of the switch-statement. - // This simplifies the case statements and prevents alot of extra inlining. + // This simplifies the case statements and prevents a lot of extra inlining. int32_t copy_value = -1; // default set for non-int2str usage cases // special logic for each specifier diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index 1a814ea707e..4543607614f 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -155,6 +155,120 @@ TEST_F(StringsDatetimeTest, ToTimestampTimezone) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); } +TEST_F(StringsDatetimeTest, ToTimestampSingleSpecifier) +{ + cudf::test::strings_column_wrapper strings{"12", "10", "09", "05"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d"); + cudf::test::fixed_width_column_wrapper expected_days{ + 11, 9, 8, 4}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_days); + + results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%m"); + cudf::test::fixed_width_column_wrapper expected_months{ + 334, 273, 243, 120}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_months); + + results = cudf::strings::is_timestamp(strings_view, "%m"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, + cudf::test::fixed_width_column_wrapper{1, 1, 1, 1}); +} + +TEST_F(StringsDatetimeTest, ToTimestampVariableFractions) +{ + cudf::test::strings_column_wrapper strings{"01:02:03.000001000", + "01:02:03.000001", + "01:02:03.1", + "01:02:03.01", + "01:02:03.0098700", + "01:02:03.0023456"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}, "%H:%M:%S.%9f"); + auto durations = + cudf::cast(results->view(), cudf::data_type{cudf::type_id::DURATION_NANOSECONDS}); + + cudf::test::fixed_width_column_wrapper expected{ + cudf::duration_ns{3723000001000}, + cudf::duration_ns{3723000001000}, + cudf::duration_ns{3723100000000}, + cudf::duration_ns{3723010000000}, + cudf::duration_ns{3723009870000}, + cudf::duration_ns{3723002345600}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*durations, expected); + + results = cudf::strings::is_timestamp(strings_view, "%H:%M:%S.%f"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, + cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1}); +} + +TEST_F(StringsDatetimeTest, ToTimestampYear) +{ + cudf::test::strings_column_wrapper strings{ + "28/02/74", "17/07/68", "20/03/19", "29/02/20", "07/02/69"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d/%m/%y"); + cudf::test::fixed_width_column_wrapper expected{ + 1519, 35992, 17975, 18321, -328}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_timestamp(strings_view, "%d/%m/%y"); + cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); +} + +TEST_F(StringsDatetimeTest, ToTimestampSingleDigits) +{ + cudf::test::strings_column_wrapper strings{"1974-2-28 01:23:45.987000123", + "2019-7-17 2:34:56.001234567", + "2019-3-20 12:34:56.100100100", + "2020-02-2 00:00:00.555777999", + "1969-12-1 00:00:01.000055000", + "1944-07-21 11:15:09.333444000", + "2021-9-8 12:07:30.000000000"}; + auto strings_view = cudf::strings_column_view(strings); + + auto results = cudf::strings::to_timestamps( + strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}, "%Y-%m-%d %H:%M:%S.%9f"); + cudf::test::fixed_width_column_wrapper expected_ns{ + 131246625987000123, + 1563330896001234567, + 1553085296100100100, + 1580601600555777999, + -2678398999945000, + -803047490666556000, + 1631102850000000000}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns); + + results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S.%6f"); + cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); +} + +TEST_F(StringsDatetimeTest, IsTimestamp) +{ + cudf::test::strings_column_wrapper strings{"2020-10-07 13:02:03 1PM +0130", + "2020:10:07 01-02-03 1AM +0130", + "2020-10-7 11:02:03 11AM -1025", + "2020-13-07 01:02:03 1AM +0000", + "2020-10-32 01:32:03 1AM +0000", + "2020-10-07 25:02:03 1AM +0000", + "2020-10-07 01:62:03 1AM +0000", + "2020-10-07 01:02:63 1AM +0000", + "2020-02-29 01:32:03 1AM +0000", + "2020-02-30 01:32:03 01AM +0000", + "2020-00-31 01:32:03 1AM +0000", + "2020-02-00 02:32:03 2AM +0000", + "2020-2-9 9:12:13 9AM +1111"}; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S %I%p %z"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *results, cudf::test::fixed_width_column_wrapper{1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1}); +} + TEST_F(StringsDatetimeTest, FromTimestamp) { std::vector h_timestamps{ @@ -465,88 +579,3 @@ TEST_F(StringsDatetimeTest, Errors) EXPECT_THROW(cudf::strings::from_timestamps(timestamps, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%A %B", view), cudf::logic_error); } - -TEST_F(StringsDatetimeTest, ToTimestampSingleSpecifier) -{ - cudf::test::strings_column_wrapper strings{"12", "10", "09", "05"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d"); - cudf::test::fixed_width_column_wrapper expected_days{ - 11, 9, 8, 4}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_days); - - results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%m"); - cudf::test::fixed_width_column_wrapper expected_months{ - 334, 273, 243, 120}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_months); - - results = cudf::strings::is_timestamp(strings_view, "%m"); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, - cudf::test::fixed_width_column_wrapper{1, 1, 1, 1}); -} - -TEST_F(StringsDatetimeTest, ToTimestampVariableFractions) -{ - cudf::test::strings_column_wrapper strings{"01:02:03.000001000", - "01:02:03.000001", - "01:02:03.1", - "01:02:03.01", - "01:02:03.0098700", - "01:02:03.0023456"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}, "%H:%M:%S.%9f"); - auto durations = - cudf::cast(results->view(), cudf::data_type{cudf::type_id::DURATION_NANOSECONDS}); - - cudf::test::fixed_width_column_wrapper expected{ - cudf::duration_ns{3723000001000}, - cudf::duration_ns{3723000001000}, - cudf::duration_ns{3723100000000}, - cudf::duration_ns{3723010000000}, - cudf::duration_ns{3723009870000}, - cudf::duration_ns{3723002345600}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*durations, expected); - - results = cudf::strings::is_timestamp(strings_view, "%H:%M:%S.%f"); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, - cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1}); -} - -TEST_F(StringsDatetimeTest, IsTimestamp) -{ - cudf::test::strings_column_wrapper strings{"2020-10-07 01:02:03", - "2020:10:07 01-02-03", - "2020-10-7 01:02:03", - "2020-13-07 01:02:03", - "2020-10-32 01:32:03", - "2020-10-07 25:02:03", - "2020-10-07 01:62:03", - "2020-10-07 01:02:63", - "2020-02-29 01:32:03", - "2020-02-30 01:32:03", - "2020-00-31 01:32:03", - "2020-02-00 01:32:03"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%d %H:%M:%S"); - CUDF_TEST_EXPECT_COLUMNS_EQUAL( - *results, cudf::test::fixed_width_column_wrapper{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}); -} - -TEST_F(StringsDatetimeTest, ToTimestampYear) -{ - cudf::test::strings_column_wrapper strings{ - "28/02/74", "17/07/68", "20/03/19", "29/02/20", "07/02/69"}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_timestamps( - strings_view, cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}, "%d/%m/%y"); - cudf::test::fixed_width_column_wrapper expected{ - 1519, 35992, 17975, 18321, -328}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - results = cudf::strings::is_timestamp(strings_view, "%d/%m/%y"); - cudf::test::fixed_width_column_wrapper is_expected({1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected); -}