From f990f931dce2c5df4802cdd40a6aee83a9a31edb Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 11 Jan 2023 19:07:49 +0530 Subject: [PATCH] Fix List offsets bug in mixed type list column in nested JSON reader (#12447) Fixes the bug in list offsets in mixed types - string/value type with struct type. In nested JSON reader, following json string `[{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}]` should produce first column as ``` List>: Length : 3 Offsets : 0, 2, 2, 2 Null count: 2 001 Struct: Length : 2: Null count: 1 10 10 NULL, 123 ``` Depends on #12330 closes #12418 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12447 --- cpp/src/io/json/json_column.cu | 169 ++++++++++++++++++++-------- cpp/tests/io/json_test.cpp | 13 ++- python/cudf/cudf/tests/test_json.py | 132 ++++++++++++++++++++++ 3 files changed, 263 insertions(+), 51 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 535854302bb..959ff5c49a6 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -195,6 +197,45 @@ reduce_to_column_tree(tree_meta_t& tree, : col_ids[parent_node_id]; }); + // Mixed types in List children go to different columns, + // so all immediate children of list column should have same max_row_offsets. + // create list's children max_row_offsets array. (initialize to zero) + // atomicMax on children max_row_offsets array. + // gather the max_row_offsets from children row offset array. + { + rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); + thrust::fill(rmm::exec_policy(stream), + list_parents_children_max_row_offsets.begin(), + list_parents_children_max_row_offsets.end(), + 0); + thrust::for_each(rmm::exec_policy(stream), + unique_col_ids.begin(), + unique_col_ids.end(), + [column_categories = column_categories.begin(), + parent_col_ids = parent_col_ids.begin(), + max_row_offsets = max_row_offsets.begin(), + list_parents_children_max_row_offsets = + list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { + auto parent_col_id = parent_col_ids[col_id]; + if (parent_col_id != parent_node_sentinel and + column_categories[parent_col_id] == node_t::NC_LIST) { + atomicMax(list_parents_children_max_row_offsets + parent_col_id, + max_row_offsets[col_id]); + } + }); + thrust::gather_if( + rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.end(), + parent_col_ids.begin(), + list_parents_children_max_row_offsets.begin(), + max_row_offsets.begin(), + [column_categories = column_categories.begin()] __device__(size_type parent_col_id) { + return parent_col_id != parent_node_sentinel and + column_categories[parent_col_id] == node_t::NC_LIST; + }); + } + // copy lists' max_row_offsets to children. // all structs should have same size. thrust::transform_if( @@ -465,47 +506,7 @@ void make_device_json_column(device_span input, auto d_ignore_vals = cudf::detail::make_device_uvector_async(ignore_vals, stream); auto d_columns_data = cudf::detail::make_device_uvector_async(columns_data, stream); - // 3. scatter List offset - // pre-condition: {node_id} is already sorted by {col_id} - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto ordered_parent_node_ids = - thrust::make_permutation_iterator(tree.parent_node_ids.begin(), node_ids.begin()); - auto ordered_row_offsets = - thrust::make_permutation_iterator(row_offsets.begin(), node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [num_nodes, - ordered_parent_node_ids, - ordered_row_offsets, - col_ids = col_ids.begin(), - sorted_col_ids = sorted_col_ids.begin(), - row_offsets = row_offsets.begin(), - node_categories = tree.node_categories.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - auto parent_node_id = ordered_parent_node_ids[i]; - if (parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST) { - // unique item - if (i == 0 or (sorted_col_ids[i - 1] != sorted_col_ids[i] or - ordered_parent_node_ids[i - 1] != parent_node_id)) { - // scatter to list_offset - d_columns_data[col_ids[parent_node_id]].child_offsets[row_offsets[parent_node_id]] = - ordered_row_offsets[i]; - } - // TODO: verify if this code is right. check with more test cases. - if (i == num_nodes - 1 or (sorted_col_ids[i] != sorted_col_ids[i + 1] or - ordered_parent_node_ids[i + 1] != parent_node_id)) { - // last value of list child_offset is its size. - d_columns_data[col_ids[parent_node_id]].child_offsets[row_offsets[parent_node_id] + 1] = - ordered_row_offsets[i] + 1; - } - } - }); - - // 4. scatter string offsets to respective columns, set validity bits + // 3. scatter string offsets to respective columns, set validity bits thrust::for_each_n( rmm::exec_policy(stream), thrust::counting_iterator(0), @@ -531,6 +532,65 @@ void make_device_json_column(device_span input, } }); + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + }); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [node_categories = tree.node_categories.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST; + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + // 5. scan on offsets. for (auto& [id, col_ref] : columns) { auto& col = col_ref.get(); @@ -691,7 +751,12 @@ std::pair, std::vector> device_json_co auto [child_column, names] = json_col.child_columns.empty() ? std::pair, - std::vector>{std::make_unique(), {}} + // EMPTY type could not used because gather throws exception on EMPTY type. + std::vector>{std::make_unique( + data_type{type_id::INT8}, + 0, + rmm::device_buffer{0, stream, mr}), + {}} : device_json_column_to_cudf_column( json_col.child_columns.begin()->second, d_input, @@ -701,14 +766,18 @@ std::pair, std::vector> device_json_co mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); - return {make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - null_count, - std::move(result_bitmask), - stream, - mr), - std::move(column_names)}; + auto ret_col = make_lists_column(num_rows, + std::move(offsets_column), + std::move(child_column), + 0, + rmm::device_buffer{0, stream, mr}, + stream, + mr); + // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and + // null validation applied in make_lists_column factory, which is not needed for json + // parent column cannot be null when its children is non-empty in JSON + ret_col->set_null_mask(std::move(result_bitmask), null_count); + return {std::move(ret_col), std::move(column_names)}; } default: CUDF_FAIL("Unsupported column type"); break; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 2e0ae2aa341..3d6350a32a2 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1463,7 +1463,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeSchema) cudf::test::strings_column_wrapper({"aa ", " bbb"})); } -TEST_F(JsonReaderTest, DISABLED_JsonNestedDtypeSchema) +TEST_F(JsonReaderTest, JsonNestedDtypeSchema) { std::string json_string = R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}])"; @@ -1512,10 +1512,21 @@ TEST_F(JsonReaderTest, DISABLED_JsonNestedDtypeSchema) // Verify column "b" is an int column EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), int_wrapper{{0, 2, 2, 2}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(1).child(0), float_wrapper{{0.0, 123.0}, {false, true}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), int_wrapper{{1, 1, 2}, {true, true, true}}); + // List column expected + auto leaf_child = float_wrapper{{0.0, 123.0}, {false, true}}; + auto const validity = {1, 0, 0}; + auto expected = cudf::make_lists_column( + 3, + int_wrapper{{0, 2, 2, 2}}.release(), + cudf::test::structs_column_wrapper{{leaf_child}, {false, true}}.release(), + 2, + cudf::test::detail::make_null_mask(validity.begin(), validity.end())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), *expected); } TEST_P(JsonReaderParamTest, JsonDtypeParsing) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 7745801f65a..f39b2448d29 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1006,3 +1006,135 @@ def test_json_round_trip_gzip(): fo.seek(loc) written_df = cudf.read_json(fo, orient="records", lines=True) assert_eq(written_df, df) + + +@pytest.mark.parametrize( + "jsonl_string", + [ + # simple list with mixed types + """{"a":[123, {}], "b":1.1}""", + """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"0": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", + """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", + """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", + """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", + """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", + # nested list with mixed types + """{"a":[123, [{"0": 123}, {}]], "b":1.0} + {"b":1.1} + {"a":[]} + {"a":[123]} + {"a":[[123], []]}""", + """{"a":[], "b":1.0} + {"a":[[[456]]]} + {"a":[[123]]} + {"a":[123]}""", + """{"a":[123], "b":1.0} + {"b":1.1} + {"b":2.1} + {"a":[[[[[[]]]]]]}""", + """{"a":[123], "b":1.0} + {"a":[[[[[[]]]]]]} + {"a":[[[[[[]]]]], [[[[[]]]]]]} + {"a":[[[[[[]]]], [[[[]]]]]]} + {"a":[[[[[[]]], [[[]]]]]]} + {"a":[[[[[[]], [[]]]]]]} + {"a":[[[[[[], 123, []]]]]]}""", + # mixed elements in multiple columns + """{"a":[123, {"0": 123}], "b":1.0} + {"c": ["abc"], "b":1.1} + {"c": ["abc", []] }""", + ], +) +def test_json_nested_mixed_types_in_list(jsonl_string): + # utility function for this test: + # replace list elements with None if it has dict and non-dict (ignore None) + def _replace_in_list(list_to_replace, replace_items): + return [ + _replace_in_list(x, replace_items) + if isinstance(x, list) + else None + if x in replace_items + else x + for x in list_to_replace + ] + + def _replace_with_nulls(df, replace_items): + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].apply( + lambda x: _replace_in_list(x, replace_items) + if isinstance(x, list) + else x + ) + return df + + # both json lines and json string tested. + json_string = "[" + jsonl_string.replace("\n", ",") + "]" + pdf = pd.read_json(jsonl_string, orient="records", lines=True) + pdf2 = pd.read_json(json_string, orient="records", lines=False) + assert_eq(pdf, pdf2) + # replace list elements with None if it has dict and non-dict + # in above test cases, these items are mixed with dict/list items + # so, replace them with None. + pdf = _replace_with_nulls(pdf, [123, "123", 12.3, "abc"]) + gdf = cudf.read_json( + StringIO(jsonl_string), + engine="cudf_experimental", + orient="records", + lines=True, + ) + gdf2 = cudf.read_json( + StringIO(json_string), + engine="cudf_experimental", + orient="records", + lines=False, + ) + if """[{"0": 123}, {}]""" not in jsonl_string: + # {} in pandas is represented as {"0": None} in cudf + assert_eq(gdf, pdf) + assert_eq(gdf2, pdf) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=gdf.to_arrow().schema, safe=False + ) + assert gdf.to_arrow().equals(pa_table_pdf) + assert gdf2.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "jsonl_string", + [ + # mixed type in list (in different order) + """{"a":[[{"0": 123}, {}], {"1": 321}], "b":1.0}""", + """{"a":[{"1": 321}, [{"0": 123}, {}], ], "b":1.0}""", + """{"a":[123, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", + """{"a":[null, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", + # mixed type in struct (in different order) + """{"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} + {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", + """{"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0} + {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0}""", + """{"a": {"b": {"0": 123}, "c": null}, "d":1.0} + {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} + {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", + """{"a": {"b": {"0": 123}, "c": 123}, "d":1.0} + {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} + {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", + ], +) +def test_json_nested_mixed_types_error(jsonl_string): + # mixing list and struct should raise an exception + with pytest.raises(RuntimeError): + cudf.read_json( + StringIO(jsonl_string), + engine="cudf_experimental", + orient="records", + lines=True, + )