From cf9f85ea2b5a5a3dd22efe0d74e64496720a348d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 17 Apr 2023 09:35:52 -0400 Subject: [PATCH] Add null-count parameter to json experimental parse_data utility (#13107) Add `null_count` parameter to the `cudf::io::json::experimental::detail::parse_data` function which already accepts a `null_mask`. Normally, the callers already know the count. This function can use the parameter to help build the output column. Found while working on #13081 Contributes to: https://github.com/rapidsai/cudf/issues/11968 Authors: - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13107 --- cpp/include/cudf/io/detail/data_casting.cuh | 41 +++++++++++++++------ cpp/src/io/json/json_column.cu | 9 +++-- cpp/src/io/json/nested_json_gpu.cu | 5 ++- cpp/tests/io/json_type_cast_test.cu | 30 ++++++++++++--- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 9091745cd2e..a62c9873e75 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -299,6 +299,7 @@ template struct string_parse { str_tuple_it str_tuples; bitmask_type* null_mask; + size_type* null_count_data; cudf::io::parse_options_view const options; size_type* d_offsets{}; char* d_chars{}; @@ -319,6 +320,7 @@ struct string_parse { serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[idx] = 0; return; } @@ -326,7 +328,10 @@ struct string_parse { char* d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; auto str_process_info = process_string(in_begin, in_end, d_buffer, options); if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - if (null_mask != nullptr) clear_bit(null_mask, idx); + if (null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + } if (!d_chars) d_offsets[idx] = 0; } else { if (!d_chars) d_offsets[idx] = str_process_info.bytes; @@ -350,28 +355,35 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, data_type col_type, B&& null_mask, + size_type null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + + auto d_null_count = rmm::device_scalar(null_count, stream); + auto null_count_data = d_null_count.data(); + if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns + // this utility calls the functor to build the offsets and chars columns; + // the bitmask and null count may be updated by parse failures auto [offsets, chars] = cudf::strings::detail::make_strings_children( string_parse{ - str_tuples, static_cast(null_mask.data()), options}, + str_tuples, static_cast(null_mask.data()), null_count_data, options}, col_size, stream, mr); - auto null_count = - cudf::detail::null_count(static_cast(null_mask.data()), 0, col_size, stream); - return make_strings_column( - col_size, std::move(offsets), std::move(chars), null_count, std::move(null_mask)); + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); } - auto out_col = make_fixed_width_column( - col_type, col_size, std::move(null_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); + auto out_col = + make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); auto output_dv_ptr = mutable_column_device_view::create(*out_col, 
stream); // use existing code (`ConvertFunctor`) to convert values @@ -379,7 +391,8 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, - [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) { + [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( + size_type row) { if (col.is_null(row)) { return; } auto const in = str_tuples[row]; @@ -388,6 +401,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, if (is_null_literal) { col.set_null(row); + atomicAdd(null_count_data, 1); return; } @@ -403,9 +417,14 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, col_type, options, false); - if (not is_parsed) { col.set_null(row); } + if (not is_parsed) { + col.set_null(row); + atomicAdd(null_count_data, 1); + } }); + out_col->set_null_count(d_null_count.value(stream)); + return out_col; } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d7b76b17644..65c93105304 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -349,7 +349,8 @@ std::vector copy_strings_to_host(device_span input, auto d_column_names = experimental::detail::parse_data(string_views.begin(), num_strings, data_type{type_id::STRING}, - rmm::device_buffer{0, stream}, + rmm::device_buffer{}, + 0, options_view, stream, rmm::mr::get_current_device_resource()); @@ -790,12 +791,14 @@ std::pair, std::vector> device_json_co target_type = cudf::io::detail::infer_data_type( options.json_view(), d_input, string_ranges_it, col_size, stream); } - validity_size_check(json_col); + + auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type auto col = experimental::detail::parse_data(string_spans_it, col_size, target_type, - json_col.validity.release(), + std::move(result_bitmask), + null_count, options.view(), stream, mr); diff --git a/cpp/src/io/json/nested_json_gpu.cu 
b/cpp/src/io/json/nested_json_gpu.cu index c6eef20f16b..c29bf2f8866 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1656,11 +1656,14 @@ std::pair, std::vector> json_column_to parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); } + auto [result_bitmask, null_count] = make_validity(json_col); + // Convert strings to the inferred data type auto col = experimental::detail::parse_data(string_spans_it, col_size, target_type, - make_validity(json_col).first, + std::move(result_bitmask), + null_count, parsing_options(options).view(), stream, mr); diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index a5e56b91aa8..806ff991579 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -79,8 +79,14 @@ TEST_F(JSONTypeCastTest, String) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto str_col = cudf::io::json::experimental::detail::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + auto str_col = cudf::io::json::experimental::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -109,8 +115,14 @@ TEST_F(JSONTypeCastTest, Int) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto col = cudf::io::json::experimental::detail::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = 
cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -146,8 +158,14 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto col = cudf::io::json::experimental::detail::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"},