From 8393bf5877c4129e6cc3dd1fdb2dca564ded8171 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 10 Apr 2023 14:18:33 -0400 Subject: [PATCH 1/3] Add null-count parameter to json experimental parse_data utility --- cpp/include/cudf/io/detail/data_casting.cuh | 41 +++++++++++++++------ cpp/src/io/json/json_column.cu | 9 +++-- cpp/src/io/json/nested_json_gpu.cu | 7 +++- cpp/tests/io/json_type_cast_test.cu | 35 +++++++++++++----- 4 files changed, 67 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 9091745cd2e..829530f393b 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -299,6 +299,7 @@ template struct string_parse { str_tuple_it str_tuples; bitmask_type* null_mask; + size_type* null_count_data; cudf::io::parse_options_view const options; size_type* d_offsets{}; char* d_chars{}; @@ -319,6 +320,7 @@ struct string_parse { serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); if (!d_chars) d_offsets[idx] = 0; return; } @@ -326,7 +328,10 @@ struct string_parse { char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; auto str_process_info = process_string(in_begin, in_end, d_buffer, options); if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - if (null_mask != nullptr) clear_bit(null_mask, idx); + if (null_mask != nullptr) { + clear_bit(null_mask, idx); + atomicAdd(null_count_data, 1); + } if (!d_chars) d_offsets[idx] = 0; } else { if (!d_chars) d_offsets[idx] = str_process_info.bytes; @@ -350,28 +355,35 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, data_type col_type, B&& null_mask, + size_type null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + + auto d_null_count = rmm::device_scalar(null_count, stream); + auto null_count_data = d_null_count.data(); + if (col_type == cudf::data_type{cudf::type_id::STRING}) { - // this utility calls the functor to build the offsets and chars columns + // this utility calls the functor to build the offsets and chars columns; + // the bitmask and null-count may be updated by parse failures auto [offsets, chars] = cudf::strings::detail::make_strings_children( string_parse{ - str_tuples, static_cast(null_mask.data()), options}, + str_tuples, static_cast(null_mask.data()), null_count_data, options}, col_size, stream, mr); - auto null_count = - cudf::detail::null_count(static_cast(null_mask.data()), 0, col_size, stream); - return make_strings_column( - col_size, std::move(offsets), std::move(chars), null_count, std::move(null_mask)); + return make_strings_column(col_size, + std::move(offsets), + std::move(chars), + d_null_count.value(stream), + std::move(null_mask)); } - auto out_col = make_fixed_width_column( - col_type, col_size, std::move(null_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); + auto out_col = + make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr); auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); // use existing code (`ConvertFunctor`) to convert values @@ -379,7 +391,8 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, - [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) { + [str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__( + size_type row) { if (col.is_null(row)) { return; } auto const in = str_tuples[row]; @@ -388,6 +401,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, if (is_null_literal) { col.set_null(row); + atomicAdd(null_count_data, 1); return; } @@ -403,9 +417,14 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, col_type, options, false); - if (not is_parsed) { col.set_null(row); } + if (not is_parsed) { + col.set_null(row); + atomicAdd(null_count_data, 1); + } }); + out_col->set_null_count(d_null_count.value(stream)); + return out_col; } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index c937315969c..99b15a8c3df 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -349,7 +349,8 @@ std::vector copy_strings_to_host(device_span input, auto d_column_names = experimental::detail::parse_data(string_views.begin(), num_strings, data_type{type_id::STRING}, - rmm::device_buffer{0, stream}, + rmm::device_buffer{}, + 0, options_view, stream, rmm::mr::get_current_device_resource()); @@ -790,12 +791,14 @@ std::pair, std::vector> device_json_co target_type = cudf::io::detail::infer_data_type( options.json_view(), d_input, string_ranges_it, col_size, stream); } - validity_size_check(json_col); + + auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type auto col = experimental::detail::parse_data(string_spans_it, col_size, target_type, - json_col.validity.release(), + std::move(result_bitmask), + null_count, options.view(), stream, mr); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 77749b42781..ee147b1f0b7 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -46,7 +46,7 @@ // Debug print flag #ifndef NJP_DEBUG_PRINT -//#define NJP_DEBUG_PRINT +// #define NJP_DEBUG_PRINT #endif namespace { @@ -1637,11 +1637,14 @@ std::pair, std::vector> json_column_to parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); } + auto [result_bitmask, null_count] = make_validity(json_col); + // Convert strings to the inferred data type auto col = experimental::detail::parse_data(string_spans_it, col_size, target_type, - make_validity(json_col).first, + std::move(result_bitmask), + null_count, parsing_options(options).view(), stream, mr); diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 2170ce4a3e2..4a37d5ac29d 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,8 +34,7 @@ using namespace cudf::test::iterators; -struct JSONTypeCastTest : public cudf::test::BaseFixture { -}; +struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { struct to_thrust_pair_fn { @@ -80,8 +79,14 @@ TEST_F(JSONTypeCastTest, String) auto null_mask = cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); - auto str_col = cudf::io::json::experimental::detail::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + auto str_col = cudf::io::json::experimental::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -110,8 +115,14 @@ TEST_F(JSONTypeCastTest, Int) auto null_mask = cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); - auto col = cudf::io::json::experimental::detail::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -147,8 +158,14 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto null_mask = cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); - auto col = cudf::io::json::experimental::detail::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, From b066b50e2116fc1193407ca57936f8df2e9d25bf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 10 Apr 2023 14:29:34 -0400 Subject: [PATCH 2/3] fix style violation --- cpp/tests/io/json_type_cast_test.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 4a37d5ac29d..31081840581 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -34,7 +34,8 @@ using namespace cudf::test::iterators; -struct JSONTypeCastTest : public cudf::test::BaseFixture {}; +struct JSONTypeCastTest : public cudf::test::BaseFixture { +}; namespace { struct to_thrust_pair_fn { From 067dafaa1051f91081592d8331209a930354cc33 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Apr 2023 09:11:28 -0400 Subject: [PATCH 3/3] remove extra hyphen --- cpp/include/cudf/io/detail/data_casting.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 829530f393b..a62c9873e75 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -367,7 +367,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, if (col_type == cudf::data_type{cudf::type_id::STRING}) { // this utility calls the functor to build the offsets and chars columns; - // the bitmask and null-count may be updated by parse failures + // the bitmask and null count may be updated by parse failures auto [offsets, chars] = cudf::strings::detail::make_strings_children( string_parse{ str_tuples, static_cast(null_mask.data()), null_count_data, options},