Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use make_strings_children in parse_data nested json reader #12382

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 98 additions & 110 deletions cpp/include/cudf/io/detail/data_casting.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/types.hpp>

Expand Down Expand Up @@ -61,12 +63,9 @@ enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILUR
/**
* @brief Providing additional information about the type casting result.
*/
template <typename in_iterator_t, typename out_iterator_t>
struct data_casting_result_info {
// One past the last input element that was parsed
in_iterator_t input_parsed_end;
// One past the last output element that was written
out_iterator_t output_processed_end;
// Number of bytes written to output
size_type bytes;
// Whether parsing succeeded, item was parsed to null, or failed
data_casting_result result;
};
Expand Down Expand Up @@ -128,20 +127,24 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str)
}

/**
* @brief Writes the UTF-8 byte sequence to \p out_it and returns the iterator to one past the
* last item that was written to \p out_it
* @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to
* \p out_it
*/
template <typename utf8_char_t, typename out_it_t>
__device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_it_t out_it)
template <typename utf8_char_t>
__device__ __forceinline__ size_type write_utf8_char(utf8_char_t utf8_chars, char*& out_it)
{
constexpr size_type MAX_UTF8_BYTES_PER_CODE_POINT = 4;
char char_bytes[MAX_UTF8_BYTES_PER_CODE_POINT];
auto const num_chars_written = strings::detail::from_char_utf8(utf8_chars, char_bytes);
size_type bytes = 0;

for (size_type i = 0; i < MAX_UTF8_BYTES_PER_CODE_POINT; i++) {
if (i < num_chars_written) { *out_it++ = char_bytes[i]; }
if (i < num_chars_written) {
if (out_it) *out_it++ = char_bytes[i];
++bytes;
}
}
return out_it;
return bytes;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When is bytes != num_chars_written?
This function seems overly complex. Maybe I'm missing something?

constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) {
   auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) : 
                                            strings::detail::from_char_utf8(character, out_it);
   out_it += bytes;
   return bytes;
}

There does not seem to be a need for the template as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think, this is written to allow the compiler to unroll the loop.
I will change this to above suggestion.

}

/**
Expand All @@ -150,47 +153,39 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_
*
* @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to
* char
* @tparam out_iterator_t A forward output iterator type
* @param in_begin Iterator to the first item to process
* @param in_end Iterator to one past the last item to process
* @param out_it Iterator to the first item to write
* @param d_buffer Output character buffer to the first item to write
* @param options Settings for controlling string processing behavior
* @return A four-tuple of (in_it_end, out_it_end, set_null, is_invalid), where in_it_end is an
* iterator to one past the last character from the input that was processed, out_it_end is an
* iterator to one past the last character that was written, set_null is true if a null literal
* was read or a parsing error occurred, and is_invalid is true if a parsing error was
* encountered
* @return A struct of (num_bytes_written, parsing_success_result), where num_bytes_written is
* the number of bytes written to d_buffer, parsing_success_result is enum value indicating whether
* parsing succeeded, item was parsed to null, or failed.
*/
template <typename in_iterator_t, typename out_iterator_t>
__device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_t> process_string(
in_iterator_t in_begin,
in_iterator_t in_end,
out_iterator_t out_it,
cudf::io::parse_options_view const& options)
template <typename in_iterator_t>
__device__ __forceinline__ data_casting_result_info
process_string(in_iterator_t in_begin,
in_iterator_t in_end,
char* d_buffer,
cudf::io::parse_options_view const& options)
{
auto const num_in_chars = thrust::distance(in_begin, in_end);

// Check if the value corresponds to the null literal
auto const is_null_literal =
serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
if (is_null_literal) { return {in_begin, out_it, data_casting_result::PARSED_TO_NULL}; }

// Whether in the original JSON this was a string value enclosed in quotes
// ({"a":"foo"} vs. {"a":1.23})
char const quote_char = '"';
char const backslash_char = '\\';

int32_t bytes = 0;
const auto num_in_chars = thrust::distance(in_begin, in_end);
// String values are indicated by keeping the quote character
bool const is_string_value =
num_in_chars >= 2LL && (*in_begin == quote_char) && (*thrust::prev(in_end) == quote_char);
bool const is_string_value = num_in_chars >= 2LL && (*in_begin == options.quotechar) &&
(*thrust::prev(in_end) == options.quotechar);

// Copy literal/numeric value
if (not is_string_value) {
while (in_begin != in_end) {
*out_it++ = *in_begin++;
if (d_buffer) *d_buffer++ = *in_begin;
++in_begin;
++bytes;
}
return {in_begin, out_it, data_casting_result::PARSING_SUCCESS};
return {bytes, data_casting_result::PARSING_SUCCESS};
}
// Whether in the original JSON this was a string value enclosed in quotes
// ({"a":"foo"} vs. {"a":1.23})
char const backslash_char = '\\';

// Escape-flag, set after encountering a backslash character
bool escape = false;
Expand All @@ -206,7 +201,10 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
// Copy single character to output
if (!escape) {
escape = (*in_begin == backslash_char);
if (!escape) { *out_it++ = *in_begin; }
if (!escape) {
if (d_buffer) *d_buffer++ = *in_begin;
++bytes;
}
++in_begin;
continue;
}
Expand All @@ -219,13 +217,12 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
auto escaped_char = get_escape_char(*in_begin);

// We escaped an invalid escape character -> "fail"/null for this item
if (escaped_char == NON_ESCAPE_CHAR) {
return {in_begin, out_it, data_casting_result::PARSING_FAILURE};
}
if (escaped_char == NON_ESCAPE_CHAR) { return {bytes, data_casting_result::PARSING_FAILURE}; }

// Regular, single-character escape
if (escaped_char != UNICODE_SEQ) {
*out_it++ = escaped_char;
if (d_buffer) *d_buffer++ = escaped_char;
++bytes;
++in_begin;
continue;
}
Expand All @@ -238,13 +235,13 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
// Make sure that there's at least 4 characters left from the
// input, which are expected to be hex digits
if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) {
return {in_begin, out_it, data_casting_result::PARSING_FAILURE};
return {bytes, data_casting_result::PARSING_FAILURE};
}

auto hex_val = parse_unicode_hex(in_begin);

// Couldn't parse hex values from the four-character sequence -> "fail"/null for this item
if (hex_val < 0) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; }
if (hex_val < 0) { return {bytes, data_casting_result::PARSING_FAILURE}; }

// Skip over the four hex digits
thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT);
Expand All @@ -269,21 +266,59 @@ __device__ __forceinline__ data_casting_result_info<in_iterator_t, out_iterator_
uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) +
(hex_low_val - UTF16_LOW_SURROGATE_BEGIN);
auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point);
out_it = write_utf8_char(utf8_chars, out_it);
bytes += write_utf8_char(utf8_chars, d_buffer);
}

// Just a single \uXXXX sequence
else {
auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val);
out_it = write_utf8_char(utf8_chars, out_it);
bytes += write_utf8_char(utf8_chars, d_buffer);
}
}

// The last character of the input is a backslash -> "fail"/null for this item
if (escape) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; }
return {in_begin, out_it, data_casting_result::PARSING_SUCCESS};
if (escape) { return {bytes, data_casting_result::PARSING_FAILURE}; }
return {bytes, data_casting_result::PARSING_SUCCESS};
}

template <typename str_tuple_it>
struct string_parse {
str_tuple_it str_tuples;
bitmask_type* null_mask;
cudf::io::parse_options_view const options;
int32_t* d_offsets{};
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
char* d_chars{};

__device__ void operator()(size_type idx)
{
if (not bit_is_set(null_mask, idx)) {
if (!d_chars) d_offsets[idx] = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!d_chars) d_offsets[idx] = 0;
if (d_chars == nullptr) { d_offsets[idx] = 0; }

return;
}
auto const in_begin = str_tuples[idx].first;
auto const in_end = in_begin + str_tuples[idx].second;
auto const num_in_chars = str_tuples[idx].second;

// Check if the value corresponds to the null literal
auto const is_null_literal =
(!d_chars) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
(!d_chars) &&
(d_chars == nullptr) &&

serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
if (is_null_literal) {
clear_bit(null_mask, idx);
if (!d_chars) d_offsets[idx] = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!d_chars) d_offsets[idx] = 0;
if (d_chars == nullptr) { d_offsets[idx] = 0; }

return;
}

char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
clear_bit(null_mask, idx);
if (!d_chars) d_offsets[idx] = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!d_chars) d_offsets[idx] = 0;
if (d_chars == nullptr) { d_offsets[idx] = 0; }

} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
if (d_chars == nullptr) { d_offsets[idx] = str_process_info.bytes; }

}
}
};
/**
* @brief Parses the data from an iterator of string views, casting it to the given target data type
*
Expand All @@ -307,67 +342,20 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
{
CUDF_FUNC_RANGE();
if (col_type == cudf::data_type{cudf::type_id::STRING}) {
rmm::device_uvector<size_type> offsets(col_size + 1, stream);

// Compute string sizes of the post-processed strings
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
[str_tuples,
sizes = device_span<size_type>{offsets},
null_mask = static_cast<bitmask_type*>(null_mask.data()),
options] __device__(size_type row) {
// String at current offset is null, e.g., due to omissions
// ([{"b":"foo"},{"a":"foo"}])
if (not bit_is_set(null_mask, row)) {
sizes[row] = 0;
return;
}

auto const in_begin = str_tuples[row].first;
auto const in_end = in_begin + str_tuples[row].second;
auto out_it = cub::DiscardOutputIterator<>{};
auto const str_process_info =
process_string(in_begin, in_end, out_it, options);

// The total number of characters that we're supposed to copy out
auto const num_chars_copied_out =
thrust::distance(out_it, str_process_info.output_processed_end);

// If, during parsing, an error occurred or we parsed the null literal ->
// set to null
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
sizes[row] = 0;
clear_bit(null_mask, row);
} else {
sizes[row] = num_chars_copied_out;
}
});

// Compute offsets for the post-processed strings
thrust::exclusive_scan(
rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());

// Write out post-processed strings (stripping off quotes, replacing escape sequences)
rmm::device_uvector<char> chars(offsets.back_element(stream), stream);
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
[str_tuples,
chars = device_span<char>{chars},
offsets = device_span<size_type>{offsets},
null_mask = static_cast<bitmask_type*>(null_mask.data()),
options] __device__(size_type row) {
if (not bit_is_set(null_mask, row)) { return; }

auto const in_begin = str_tuples[row].first;
auto const in_end = in_begin + str_tuples[row].second;
auto out_it = &chars[offsets[row]];
process_string(in_begin, in_end, out_it, options);
});
// this utility calls the functor to build the offsets and chars columns
auto [offsets, chars] = cudf::strings::detail::make_strings_children(
string_parse<decltype(str_tuples)>{
str_tuples, static_cast<bitmask_type*>(null_mask.data()), options},
col_size,
stream,
mr);

return make_strings_column(
col_size, std::move(offsets), std::move(chars), std::move(null_mask));
col_size,
std::move(offsets),
std::move(chars),
cudf::detail::null_count(static_cast<bitmask_type*>(null_mask.data()), 0, col_size, stream),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was curious if this could be computed in the functor and if that would be faster than doing the null_count() calculation here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

null_count method will be faster because it handles 32-bit per thread, while functor will handle 1 bit per thread, which will eventually lead to lesser atomicAdds to global memory too.

Besides, functor's shared memory is not in our control since we don't know threads per block. So, we can't use BlockReduce, which will increase no of atomicAdds to the global memory location.

std::move(null_mask));
}

auto out_col = make_fixed_width_column(
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,10 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
CUDF_EXPECTS(json_col.validity.size() >= bitmask_allocation_size_bytes(json_col.num_rows),
"valid_count is too small");
auto null_count =
cudf::detail::null_count(json_col.validity.data(), 0, json_col.num_rows, stream);
json_col.type == json_col_t::StringColumn
// Skipped because calculated again in parse_data
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
? cudf::UNKNOWN_NULL_COUNT
: cudf::detail::null_count(json_col.validity.data(), 0, json_col.num_rows, stream);
// full null_mask is always required for parse_data
return {json_col.validity.release(), null_count};
// Note: json_col modified here, moves this memory
Expand Down