diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 5d7fb9d6b43..5ea29fcfd2d 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -799,9 +799,7 @@ std::pair, std::vector> device_json_co // This is to match the existing JSON reader's behaviour: // - Non-string columns will always be returned as nullable // - String columns will be returned as nullable, iff there's at least one null entry - if (target_type.id() == type_id::STRING and col->null_count() == 0) { - col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - } + if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } // For string columns return ["offsets", "char"] schema if (target_type.id() == type_id::STRING) { @@ -830,7 +828,7 @@ std::pair, std::vector> device_json_co // The null_mask is set after creation of struct column is to skip the superimpose_nulls and // null validation applied in make_structs_column factory, which is not needed for json auto ret_col = make_structs_column(num_rows, std::move(child_columns), 0, {}, stream, mr); - ret_col->set_null_mask(std::move(result_bitmask), null_count); + if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); } return {std::move(ret_col), column_names}; } case json_col_t::ListColumn: { @@ -877,7 +875,7 @@ std::pair, std::vector> device_json_co // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and // null validation applied in make_lists_column factory, which is not needed for json // parent column cannot be null when its children is non-empty in JSON - ret_col->set_null_mask(std::move(result_bitmask), null_count); + if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); } return {std::move(ret_col), std::move(column_names)}; } default: CUDF_FAIL("Unsupported column type"); break; diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index 205a6b96aa9..5580628b0fe 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -569,6 +569,9 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, } else { out_columns.emplace_back(std::move(out_column)); } + if (out_columns.back()->null_count() == 0) { + out_columns.back()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + } } std::vector column_infos; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 496e5b25e60..5eb3883dc64 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2068,11 +2068,13 @@ std::pair, std::vector> json_column_to auto make_validity = [stream, mr](json_column const& json_col) -> std::pair { + auto const null_count = json_col.current_offset - json_col.valid_count; + if (null_count == 0) { return {rmm::device_buffer{}, null_count}; } return {rmm::device_buffer{json_col.validity.data(), bitmask_allocation_size_bytes(json_col.current_offset), stream, mr}, - json_col.current_offset - json_col.valid_count}; + null_count}; }; auto get_child_schema = [schema](auto child_name) -> std::optional { @@ -2138,9 +2140,7 @@ std::pair, std::vector> json_column_to // This is to match the existing JSON reader's behaviour: // - Non-string columns will always be returned as nullable // - String columns will be returned as nullable, iff there's at least one null entry - if (target_type.id() == type_id::STRING and col->null_count() == 0) { - col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - } + if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } // For string columns return ["offsets", "char"] schema if (target_type.id() == type_id::STRING) { diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index a2db2d69984..09c9179de82 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -145,12 +145,10 @@ MATCHER_P(FloatNearPointwise, tolerance, "Out-of-range") // temporary method to verify the float columns until // CUDF_TEST_EXPECT_COLUMNS_EQUAL supports floating point -template -void check_float_column(cudf::column_view const& col, - std::vector const& data, - valid_t const& validity) +template +void check_float_column(cudf::column_view const& col, std::vector const& data) { - CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(col, (wrapper{data.begin(), data.end(), validity})); + CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(col, (wrapper(data.begin(), data.end()))); EXPECT_EQ(col.null_count(), 0); EXPECT_THAT(cudf::test::to_host(col).first, ::testing::Pointwise(FloatNearPointwise(1e-6), data)); @@ -325,11 +323,8 @@ TEST_P(JsonReaderParamTest, BasicJsonLines) EXPECT_EQ(result.metadata.schema_info[0].name, "0"); EXPECT_EQ(result.metadata.schema_info[1].name, "1"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2, 3}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{1.1, 2.2, 3.3}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2, 3}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}}); } TEST_P(JsonReaderParamTest, FloatingPoint) @@ -366,15 +361,9 @@ TEST_P(JsonReaderParamTest, FloatingPoint) EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT32); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - CUDF_TEST_EXPECT_COLUMNS_EQUAL( result.tbl->get_column(0), - float_wrapper{{5.6, 56.79, 12000000000., 0.7, 3.000, 12.34, 0.31, -73.98007199999998}, - validity}); - - auto const bitmask = cudf::test::bitmask_to_host(result.tbl->get_column(0)); - ASSERT_EQ((1u << result.tbl->get_column(0).size()) - 1, bitmask[0]); + float_wrapper{{5.6, 56.79, 12000000000., 0.7, 3.000, 12.34, 0.31, -73.98007199999998}}); } TEST_P(JsonReaderParamTest, JsonLinesStrings) @@ -405,10 +394,8 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings) EXPECT_EQ(result.metadata.schema_info[1].name, "1"); EXPECT_EQ(result.metadata.schema_info[2].name, "2"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), cudf::test::strings_column_wrapper({"aa ", " bbb"})); } @@ -465,8 +452,6 @@ TEST_P(JsonReaderParamTest, MultiColumn) .legacy(is_legacy_test(test_opt)); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - auto const view = result.tbl->view(); EXPECT_EQ(view.num_columns(), 6); @@ -478,15 +463,15 @@ TEST_P(JsonReaderParamTest, MultiColumn) EXPECT_EQ(view.column(5).type().id(), cudf::type_id::FLOAT64); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(0), - int8_wrapper{int8_values.begin(), int8_values.end(), validity}); + int8_wrapper(int8_values.begin(), int8_values.end())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(1), - int16_wrapper{int16_values.begin(), int16_values.end(), validity}); + int16_wrapper(int16_values.begin(), int16_values.end())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(2), - int_wrapper{int32_values.begin(), int32_values.end(), validity}); + int_wrapper(int32_values.begin(), int32_values.end())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), - int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - check_float_column(view.column(4), float32_values, validity); - check_float_column(view.column(5), float64_values, validity); + int64_wrapper(int64_values.begin(), int64_values.end())); + check_float_column(view.column(4), float32_values); + check_float_column(view.column(5), float64_values); } TEST_P(JsonReaderParamTest, Booleans) @@ -522,10 +507,8 @@ TEST_P(JsonReaderParamTest, Booleans) EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::BOOL8); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - bool_wrapper{{true, true, false, false, true}, validity}); + bool_wrapper{{true, true, false, false, true}}); } TEST_P(JsonReaderParamTest, Dates) @@ -669,10 +652,8 @@ TEST_P(JsonReaderParamTest, JsonLinesDtypeInference) EXPECT_EQ(result.metadata.schema_info[1].name, "1"); EXPECT_EQ(result.metadata.schema_info[2].name, "2"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{100, 200}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{100, 200}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), cudf::test::strings_column_wrapper({"aa ", " bbb"})); } @@ -706,10 +687,8 @@ TEST_P(JsonReaderParamTest, JsonLinesFileInput) EXPECT_EQ(result.metadata.schema_info[0].name, "0"); EXPECT_EQ(result.metadata.schema_info[1].name, "1"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{11, 22}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{11, 22}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}}); } TEST_F(JsonReaderTest, JsonLinesByteRange) @@ -734,10 +713,7 @@ TEST_F(JsonReaderTest, JsonLinesByteRange) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.metadata.schema_info[0].name, "0"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{3000, 4000, 5000}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{3000, 4000, 5000}}); } TEST_P(JsonReaderDualTest, JsonLinesObjects) @@ -763,10 +739,8 @@ TEST_P(JsonReaderDualTest, JsonLinesObjects) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{1}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{2.0}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{2.0}}); } TEST_P(JsonReaderDualTest, JsonLinesObjectsStrings) @@ -791,11 +765,8 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsStrings) EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); EXPECT_EQ(result.metadata.schema_info[2].name, "col3"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{100, 200}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{100, 200}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), cudf::test::strings_column_wrapper({"aaa", "bbb"})); }; @@ -870,10 +841,8 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder) EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); EXPECT_EQ(result.metadata.schema_info[2].name, "col3"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{100, 200}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{100, 200}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), cudf::test::strings_column_wrapper({"aaa", "bbb"})); } @@ -952,10 +921,7 @@ TEST_F(JsonReaderTest, ArrowFileSource) EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}}); } TEST_P(JsonReaderParamTest, InvalidFloatingPoint) @@ -1241,12 +1207,8 @@ TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputs) EXPECT_EQ(result.metadata.schema_info[0].name, "0"); EXPECT_EQ(result.metadata.schema_info[1].name, "1"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{11, 22, 33, 44}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{11, 22, 33, 44}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}}); } TEST_P(JsonReaderNoLegacy, JsonLinesMultipleFileInputsNoNL) @@ -1286,12 +1248,8 @@ TEST_P(JsonReaderNoLegacy, JsonLinesMultipleFileInputsNoNL) EXPECT_EQ(result.metadata.schema_info[0].name, "0"); EXPECT_EQ(result.metadata.schema_info[1].name, "1"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{11, 22, 33, 44}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{11, 22, 33, 44}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}}); } TEST_F(JsonReaderTest, BadDtypeParams) @@ -1427,7 +1385,10 @@ TEST_F(JsonReaderTest, JsonLongString) cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - cudf::table_view const expected = tbl_view; + cudf::column_view int16_with_mask(repeat_times); + cudf::column_view int16( + int16_with_mask.type(), int16_with_mask.size(), int16_with_mask.head(), nullptr, 0); + cudf::table_view const expected = cudf::table_view{{col1, col2, int16}}; std::map types; types["col1"] = data_type{type_id::STRING}; types["col2"] = data_type{type_id::STRING}; @@ -1641,10 +1602,8 @@ TEST_P(JsonReaderParamTest, JsonDtypeSchema) EXPECT_EQ(result.metadata.schema_info[1].name, "1"); EXPECT_EQ(result.metadata.schema_info[2].name, "2"); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2}, validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), cudf::test::strings_column_wrapper({"aa ", " bbb"})); } @@ -1700,8 +1659,7 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), int_wrapper{{0, 2, 2, 2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(1).child(0), float_wrapper{{0.0, 123.0}, {false, true}}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - int_wrapper{{1, 1, 2}, {true, true, true}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), int_wrapper{{1, 1, 2}}); // List column expected auto leaf_child = float_wrapper{{0.0, 123.0}, {false, true}}; auto const validity = {1, 0, 0}; diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index b0ffbe3d154..93ad05a29fe 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -646,10 +646,8 @@ TEST_P(JsonParserTest, ExtractColumn) auto const expected_col_count = 2; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); - auto expected_col1 = - cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}, {true, true, true}); - auto expected_col2 = - cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}, {true, true, true}); + auto expected_col1 = cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}); + auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1); @@ -952,8 +950,7 @@ TEST_P(JsonParserTest, ExtractColumnWithQuotes) auto expected_col1 = cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true}); - auto expected_col2 = - cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}, {true, true, true}); + auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1);