diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 06d38b73446..6e71fd4b17d 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include @@ -61,12 +63,9 @@ enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILUR /** * @brief Providing additional information about the type casting result. */ -template struct data_casting_result_info { - // One past the last input element that was parsed - in_iterator_t input_parsed_end; - // One past the last output element that was written - out_iterator_t output_processed_end; + // Number of bytes written to output + size_type bytes; // Whether parsing succeeded, item was parsed to null, or failed data_casting_result result; }; @@ -128,20 +127,15 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str) } /** - * @brief Writes the UTF-8 byte sequence to \p out_it and returns the iterator to one past the - * last item that was written to \p out_it + * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to + * \p out_it */ -template -__device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_it_t out_it) +constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) { - constexpr size_type MAX_UTF8_BYTES_PER_CODE_POINT = 4; - char char_bytes[MAX_UTF8_BYTES_PER_CODE_POINT]; - auto const num_chars_written = strings::detail::from_char_utf8(utf8_chars, char_bytes); - - for (size_type i = 0; i < MAX_UTF8_BYTES_PER_CODE_POINT; i++) { - if (i < num_chars_written) { *out_it++ = char_bytes[i]; } - } 
- return out_it; + auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) + : strings::detail::from_char_utf8(character, out_it); + if (out_it) out_it += bytes; + return bytes; } /** @@ -150,47 +144,39 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_ * * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to * char - * @tparam out_iterator_t A forward output iterator type * @param in_begin Iterator to the first item to process * @param in_end Iterator to one past the last item to process - * @param out_it Iterator to the first item to write + * @param d_buffer Pointer to the first output character to write * @param options Settings for controlling string processing behavior - * @return A four-tuple of (in_it_end, out_it_end, set_null, is_invalid), where in_it_end is an - * iterator to one past the last character from the input that was processed, out_it_end is an - * iterator to one past the last character that was written, set_null is true if a null literal - * was read or a parsing error occurred, and is_invalid is true if a parsing error was - * encountered + * @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is + * the number of bytes written to d_buffer and parsing_success_result is an enum value indicating + * whether parsing succeeded, the item was parsed to null, or parsing failed. 
*/ -template -__device__ __forceinline__ data_casting_result_info process_string( - in_iterator_t in_begin, - in_iterator_t in_end, - out_iterator_t out_it, - cudf::io::parse_options_view const& options) +template +__device__ __forceinline__ data_casting_result_info +process_string(in_iterator_t in_begin, + in_iterator_t in_end, + char* d_buffer, + cudf::io::parse_options_view const& options) { - auto const num_in_chars = thrust::distance(in_begin, in_end); - - // Check if the value corresponds to the null literal - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal) { return {in_begin, out_it, data_casting_result::PARSED_TO_NULL}; } - - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) - char const quote_char = '"'; - char const backslash_char = '\\'; - + int32_t bytes = 0; + const auto num_in_chars = thrust::distance(in_begin, in_end); // String values are indicated by keeping the quote character - bool const is_string_value = - num_in_chars >= 2LL && (*in_begin == quote_char) && (*thrust::prev(in_end) == quote_char); + bool const is_string_value = num_in_chars >= 2LL && (*in_begin == options.quotechar) && + (*thrust::prev(in_end) == options.quotechar); // Copy literal/numeric value if (not is_string_value) { while (in_begin != in_end) { - *out_it++ = *in_begin++; + if (d_buffer) *d_buffer++ = *in_begin; + ++in_begin; + ++bytes; } - return {in_begin, out_it, data_casting_result::PARSING_SUCCESS}; + return {bytes, data_casting_result::PARSING_SUCCESS}; } + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. 
{"a":1.23}) + char const backslash_char = '\\'; // Escape-flag, set after encountering a backslash character bool escape = false; @@ -206,7 +192,10 @@ __device__ __forceinline__ data_casting_result_info "fail"/null for this item - if (escaped_char == NON_ESCAPE_CHAR) { - return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; - } + if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; } // Regular, single-character escape if (escaped_char != UNICODE_SEQ) { - *out_it++ = escaped_char; + if (d_buffer) *d_buffer++ = escaped_char; + ++bytes; ++in_begin; continue; } @@ -238,13 +226,13 @@ __device__ __forceinline__ data_casting_result_info "fail"/null for this item - if (hex_val < 0) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; } + if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; } // Skip over the four hex digits thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); @@ -269,21 +257,59 @@ __device__ __forceinline__ data_casting_result_info "fail"/null for this item - if (escape) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; } - return {in_begin, out_it, data_casting_result::PARSING_SUCCESS}; + if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; } + return {bytes, data_casting_result::PARSING_SUCCESS}; } +template +struct string_parse { + str_tuple_it str_tuples; + bitmask_type* null_mask; + cudf::io::parse_options_view const options; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) + { + if (not bit_is_set(null_mask, idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const in_begin = str_tuples[idx].first; + auto const in_end = in_begin + str_tuples[idx].second; + auto const num_in_chars = str_tuples[idx].second; + + // Check if the value corresponds to the null literal + auto const is_null_literal = + (!d_chars) && + serialized_trie_contains(options.trie_na, {in_begin, 
static_cast(num_in_chars)}); + if (is_null_literal) { + clear_bit(null_mask, idx); + if (!d_chars) d_offsets[idx] = 0; + return; + } + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto str_process_info = process_string(in_begin, in_end, d_buffer, options); + if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { + clear_bit(null_mask, idx); + if (!d_chars) d_offsets[idx] = 0; + } else { + if (!d_chars) d_offsets[idx] = str_process_info.bytes; + } + } +}; /** * @brief Parses the data from an iterator of string views, casting it to the given target data type * @@ -307,67 +333,18 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { CUDF_FUNC_RANGE(); if (col_type == cudf::data_type{cudf::type_id::STRING}) { - rmm::device_uvector offsets(col_size + 1, stream); - - // Compute string sizes of the post-processed strings - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - sizes = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - // String at current offset is null, e.g., due to omissions - // ([{"b":"foo"},{"a":"foo"}]) - if (not bit_is_set(null_mask, row)) { - sizes[row] = 0; - return; - } - - auto const in_begin = str_tuples[row].first; - auto const in_end = in_begin + str_tuples[row].second; - auto out_it = cub::DiscardOutputIterator<>{}; - auto const str_process_info = - process_string(in_begin, in_end, out_it, options); - - // The total number of characters that we're supposed to copy out - auto const num_chars_copied_out = - thrust::distance(out_it, str_process_info.output_processed_end); - - // If, during parsing, an error occurred or we parsed the null literal -> - // set to null - if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - sizes[row] = 0; - clear_bit(null_mask, row); - } else { - sizes[row] = num_chars_copied_out; - } - }); - - // Compute offsets for the post-processed strings 
- thrust::exclusive_scan( - rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); - - // Write out post-processed strings (stripping off quotes, replacing escape sequences) - rmm::device_uvector chars(offsets.back_element(stream), stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - chars = device_span{chars}, - offsets = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { return; } - - auto const in_begin = str_tuples[row].first; - auto const in_end = in_begin + str_tuples[row].second; - auto out_it = &chars[offsets[row]]; - process_string(in_begin, in_end, out_it, options); - }); - + // this utility calls the functor to build the offsets and chars columns + auto [offsets, chars] = cudf::strings::detail::make_strings_children( + string_parse{ + str_tuples, static_cast(null_mask.data()), options}, + col_size, + stream, + mr); + + auto null_count = + cudf::detail::null_count(static_cast(null_mask.data()), 0, col_size, stream); return make_strings_column( - col_size, std::move(offsets), std::move(chars), std::move(null_mask)); + col_size, std::move(offsets), std::move(chars), null_count, std::move(null_mask)); } auto out_col = make_fixed_width_column( diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 5f512b4959e..535854302bb 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -567,10 +567,13 @@ std::pair, std::vector> device_json_co rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto make_validity = - [stream](device_json_column& json_col) -> std::pair { + auto validity_size_check = [](device_json_column& json_col) { CUDF_EXPECTS(json_col.validity.size() >= bitmask_allocation_size_bytes(json_col.num_rows), "valid_count is too small"); + }; + auto make_validity = [stream, validity_size_check]( + 
device_json_column& json_col) -> std::pair { + validity_size_check(json_col); auto null_count = cudf::detail::null_count(json_col.validity.data(), 0, json_col.num_rows, stream); // full null_mask is always required for parse_data @@ -626,11 +629,12 @@ std::pair, std::vector> device_json_co target_type = cudf::io::detail::infer_data_type( options.json_view(), d_input, string_ranges_it, col_size, stream); } + validity_size_check(json_col); // Convert strings to the inferred data type auto col = experimental::detail::parse_data(string_spans_it, col_size, target_type, - make_validity(json_col).first, + json_col.validity.release(), options.view(), stream, mr);