From 4fd6dd7b960e497abd13127e8eb7939f168bce08 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 17 Aug 2023 03:22:54 +0530 Subject: [PATCH] Fix List's missing children metadata in JSON writer (#13869) Fixes #13800 The children metadata of list column was not constructed in Cython code. This is fixed by constructing all children columns metadata for list column. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/13869 --- python/cudf/cudf/_lib/json.pyx | 9 ++-- python/cudf/cudf/tests/test_json.py | 65 ++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 64189a5626f..437c3ef6ec4 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -259,9 +259,10 @@ cdef _set_col_children_metadata(Column col, child_col, col_meta.children[i] ) elif is_list_dtype(col): - _set_col_children_metadata( - col.children[0], - col_meta.children[0] - ) + for i, child_col in enumerate(col.children): + col_meta.children.push_back(child_info) + _set_col_children_metadata( + child_col, col_meta.children[i] + ) else: return diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 43b0ca0119a..47f5b99acf7 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -215,6 +215,69 @@ def test_cudf_json_writer_read(gdf_writer_types): assert_eq(pdf2, gdf2) +@pytest.mark.parametrize( + "jsonl_string, expected", + [ + # fixed width + ("""{"a":10, "b":1.1}\n {"a":20, "b":2.1}\n""", None), + # simple list + ("""{"a":[1, 2, 3], "b":1.1}\n {"a":[]}\n""", None), + # simple struct + ("""{"a":{"c": 123 }, "b":1.1}\n {"a": {"c": 456}}\n""", None), + # list of lists + ("""{"a":[[], [1, 2], [3, 4]], "b":1.1}\n""", None), + ("""{"a":[null, [1, 2], [null, 4]], "b":1.1}\n""", None), + # list of structs + # error ("""{"a":[null, {}], "b":1.1}\n""", None), + ( + """{"a":[null, {"L": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", + None, + ), + ( + """{"a":[{"L": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", + None, + ), + # struct of lists + ( + """{"a":{"L": [1, 2, 3]}, "b":1.1}\n {"a": {"L": [4, 5, 6]}}\n""", + None, + ), + ("""{"a":{"L": [1, 2, null]}, "b":1.1}\n {"a": {"L": []}}\n""", None), + # struct of structs + ( + """{"a":{"L": {"M": 123}}, "b":1.1} + {"a": {"L": {"M": 456}}}\n""", + None, + ), + ( + """{"a":{"L": {"M": null}}, "b":1.1}\n {"a": {"L": {}}}\n""", + """{"a":{"L": {}}, "b":1.1}\n {"a": {"L": {}}}\n""", + ), + # list of structs of lists + ("""{"a":[{"L": [1, 2, 3]}, {"L": [4, 5, 6]}], "b":1.1}\n""", None), + ("""{"a":[{"L": [1, 2, null]}, {"L": []}], "b":1.1}\n""", None), + # struct of lists of structs + ("""{"a":{"L": [{"M": 123}, {"M": 456}]}, "b":1.1}\n""", None), + ( + """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", + """{"a":{"L": [{}, {}]}, "b":1.1}\n""", + ), + ], +) +def test_cudf_json_roundtrip(jsonl_string, expected): + gdf = cudf.read_json( + StringIO(jsonl_string), + lines=True, + engine="cudf", + # dtype=dict(dtypes), + ) + expected = jsonl_string if expected is None else expected + gdf_string = gdf.to_json( + orient="records", lines=True, engine="cudf", include_nulls=False + ) + assert_eq(gdf_string, expected.replace(" ", "")) + + @pytest.mark.parametrize("sink", ["string", "file"]) def test_cudf_json_writer_sinks(sink, tmp_path_factory): df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -1185,7 +1248,7 @@ def test_json_array_of_arrays(data, lines): # simple list with mixed types """{"a":[123, {}], "b":1.1}""", """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"0": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",