From 35b0a52a6288cc971bb731371cda8d72772b530b Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Tue, 27 Sep 2022 04:50:34 -0500
Subject: [PATCH] Enable `schema_element` & `keep_quotes` support in json reader (#11746)

This PR plumbs `schema_element` and `keep_quotes` support into the json reader.

**Deprecation:** This PR also deprecates passing `dtype` as a `list`. That is a
very outdated legacy feature that we had continued to support, and it cannot be
supported together with `schema_element`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/11746
---
 python/cudf/cudf/_lib/cpp/io/json.pxd |  12 ++-
 python/cudf/cudf/_lib/json.pyx        |  39 +++++++-
 python/cudf/cudf/io/json.py           |  17 +++-
 python/cudf/cudf/tests/test_json.py   | 135 ++++++++++++++++++++++++--
 python/cudf/cudf/utils/ioutils.py     |  29 ++++++
 5 files changed, 217 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index bc9d87a5cbf..7333aad7ddf 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -15,6 +15,10 @@ from cudf._lib.cpp.types cimport data_type, size_type
 cdef extern from "cudf/io/json.hpp" \
         namespace "cudf::io" nogil:
 
+    cdef struct schema_element:
+        data_type type
+        map[string, schema_element] child_types
+
     cdef cppclass json_reader_options:
         json_reader_options() except+
         cudf_io_types.source_info get_source() except+
@@ -28,7 +32,7 @@ cdef extern from "cudf/io/json.hpp" \
 
         # setter
         void set_dtypes(vector[data_type] types) except+
-        void set_dtypes(map[string, data_type] types) except+
+        void set_dtypes(map[string, schema_element] types) except+
         void set_compression(
             cudf_io_types.compression_type compression
         ) except+
@@ -37,6 +41,7 @@ cdef extern from "cudf/io/json.hpp" \
         void enable_lines(bool val) except+
         void enable_dayfirst(bool val) except+
         void enable_experimental(bool val) except+
+        void enable_keep_quotes(bool val) except+
 
         @staticmethod
         json_reader_options_builder builder(
@@ -55,7 +60,7 @@ cdef extern from "cudf/io/json.hpp" \
             vector[data_type] types
         ) except+
         json_reader_options_builder& dtypes(
-            map[string, data_type] types
+            map[string, schema_element] types
         ) except+
         json_reader_options_builder& compression(
             cudf_io_types.compression_type compression
@@ -75,6 +80,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& experimental(
             bool val
         ) except+
+        json_reader_options_builder& keep_quotes(
+            bool val
+        ) except+
 
         json_reader_options build() except+
 
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 376850b7b1b..b0aafc275d6 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -20,6 +20,7 @@ cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.io.json cimport (
     json_reader_options,
     read_json as libcudf_read_json,
+    schema_element,
 )
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
 from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
@@ -32,7 +33,8 @@ cpdef read_json(object filepaths_or_buffers,
                 bool lines,
                 object compression,
                 object byte_range,
-                bool experimental):
+                bool experimental,
+                bool keep_quotes):
     """
     Cython function to call into libcudf API, see `read_json`.
@@ -55,7 +57,7 @@ cpdef read_json(object filepaths_or_buffers,
 
     # Setup arguments
     cdef vector[data_type] c_dtypes_list
-    cdef map[string, data_type] c_dtypes_map
+    cdef map[string, schema_element] c_dtypes_schema_map
     cdef cudf_io_types.compression_type c_compression
     # Determine byte read offsets if applicable
     cdef size_type c_range_offset = (
@@ -81,8 +83,8 @@ cpdef read_json(object filepaths_or_buffers,
     elif dtype is not True:
         if isinstance(dtype, abc.Mapping):
             for k, v in dtype.items():
-                c_dtypes_map[str(k).encode()] = \
-                    _get_cudf_data_type_from_dtype(v)
+                c_dtypes_schema_map[str(k).encode()] = \
+                    _get_cudf_schema_element_from_dtype(v)
         elif isinstance(dtype, abc.Collection):
             is_list_like_dtypes = True
             c_dtypes_list.reserve(len(dtype))
@@ -105,8 +107,9 @@ cpdef read_json(object filepaths_or_buffers,
     if is_list_like_dtypes:
         opts.set_dtypes(c_dtypes_list)
    else:
-        opts.set_dtypes(c_dtypes_map)
+        opts.set_dtypes(c_dtypes_schema_map)
 
+    opts.enable_keep_quotes(keep_quotes)
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result
 
@@ -123,6 +126,32 @@ cpdef read_json(object filepaths_or_buffers,
 
     return df
 
+
+cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
+    cdef schema_element s_element
+    cdef data_type lib_type
+    if cudf.api.types.is_categorical_dtype(dtype):
+        raise NotImplementedError(
+            "CategoricalDtype as dtype is not yet "
+            "supported in JSON reader"
+        )
+
+    dtype = cudf.dtype(dtype)
+    lib_type = dtype_to_data_type(dtype)
+    s_element.type = lib_type
+    if isinstance(dtype, cudf.StructDtype):
+        for name, child_type in dtype.fields.items():
+            s_element.child_types[name.encode()] = \
+                _get_cudf_schema_element_from_dtype(child_type)
+    elif isinstance(dtype, cudf.ListDtype):
+        s_element.child_types["offsets".encode()] = \
+            _get_cudf_schema_element_from_dtype(cudf.dtype("int32"))
+        s_element.child_types["element".encode()] = \
+            _get_cudf_schema_element_from_dtype(dtype.element_type)
+
+    return s_element
+
+
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
     if cudf.api.types.is_categorical_dtype(dtype):
         raise NotImplementedError(
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index e1e8e7cdb3d..2a0ae565974 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -21,13 +21,27 @@ def read_json(
     lines=False,
     compression="infer",
     byte_range=None,
+    keep_quotes=False,
     *args,
     **kwargs,
 ):
     """{docstring}"""
 
+    if not isinstance(dtype, (abc.Mapping, bool)):
+        warnings.warn(
+            "passing 'dtype' as list is deprecated; instead pass "
+            "a dict of column names and types as key-value pairs; "
+            "in future versions 'dtype' can only be a dict or bool",
+            FutureWarning,
+        )
+
     if engine == "cudf" and not lines:
-        raise ValueError("cudf engine only supports JSON Lines format")
+        raise ValueError(f"{engine} engine only supports JSON Lines format")
+    if engine != "cudf_experimental" and keep_quotes:
+        raise ValueError(
+            "keep_quotes='True' is supported only with"
+            " engine='cudf_experimental'"
+        )
     if engine == "auto":
         engine = "cudf" if lines else "pandas"
     if engine == "cudf" or engine == "cudf_experimental":
@@ -64,6 +78,7 @@ def read_json(
             compression,
             byte_range,
             engine == "cudf_experimental",
+            keep_quotes,
         )
     else:
         warnings.warn(
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 92227707b18..1fdef44546a 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -274,11 +274,10 @@ def test_json_lines_byte_range(json_input):
     assert df.shape == (1, 3)
 
 
-@pytest.mark.parametrize(
-    "dtype", [["float", "int", "short"], {1: "int", 2: "short", 0: "float"}]
-)
-def test_json_lines_dtypes(json_input, dtype):
-    df = cudf.read_json(json_input, lines=True, dtype=dtype)
+def test_json_lines_dtypes(json_input):
+    df = cudf.read_json(
+        json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"}
+    )
     assert all(df.dtypes == ["float64", "int64", "int16"])
 
 
@@ -302,7 +301,10 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
     pd_df.to_json(fname, compression=out_comp, lines=True, orient="records")
 
     cu_df = cudf.read_json(
-        str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"]
+        str(fname),
+        compression=in_comp,
+        lines=True,
+        dtype={"col1": "int32", "col2": "int32"},
     )
     assert_eq(pd_df, cu_df)
 
@@ -345,7 +347,9 @@ def test_json_bool_values():
     # boolean values should be converted to 0/1
     np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy())
 
-    cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"])
+    cu_df = cudf.read_json(
+        buffer, lines=True, dtype={"0": "bool", "1": "long"}
+    )
     np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
 
 
@@ -663,3 +667,120 @@ def test_json_types_data():
         pdf, schema=df.to_arrow().schema, safe=False
     )
     assert df.to_arrow().equals(pa_table_pdf)
+
+
+@pytest.mark.parametrize(
+    "keep_quotes,result",
+    [
+        (
+            True,
+            {
+                "c1": [
+                    {"f1": '"sf11"', "f2": '"sf21"'},
+                    {"f1": '"sf12"', "f2": '"sf22"'},
+                ],
+                "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']],
+            },
+        ),
+        (
+            False,
+            {
+                "c1": [
+                    {"f1": "sf11", "f2": "sf21"},
+                    {"f1": "sf12", "f2": "sf22"},
+                ],
+                "c2": [["l11", "l21"], ["l12", "l22"]],
+            },
+        ),
+    ],
+)
+def test_json_keep_quotes(keep_quotes, result):
+    bytes_file = BytesIO()
+    data = {
+        "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
+        "c2": [["l11", "l21"], ["l12", "l22"]],
+    }
+    pdf = pd.DataFrame(data)
+    pdf.to_json(bytes_file, orient="records", lines=True)
+
+    actual = cudf.read_json(
+        bytes_file,
+        engine="cudf_experimental",
+        orient="records",
+        lines=True,
+        keep_quotes=keep_quotes,
+    )
+    expected = pd.DataFrame(result)
+
+    assert_eq(actual, expected)
+
+
+def test_json_dtypes_nested_data():
+    # a: StructDtype({'a': StructDtype({'b': dtype('float64')}),
+    # 'b': dtype('int64')})
+    # b: ListDtype(ListDtype(float64))
+    actual_json_str = (
+        '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n'
+        '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n'
+        '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n'
+        '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n'
'{"a":{"a":{"b":40.1},"b":1090},"b":null}\n' + ) + + """ + In [3]: df + Out[3]: + a b + 0 {'a': {'b': 10.0}, 'b': 11} [[10.0, 1.1], [12.0, 23.0]] + 1 {'a': {'b': 107.0}, 'b': 5} [[10.0, 11.2], [12.0, 0.23]] + 2 {'a': {'b': 50.7}, 'b': 2} [[10.0, 11.3], [12.0, 2.3]] + 3 {'a': {'b': 1.2}, 'b': 67} [[6.0, 7.0]] + 4 {'a': {'b': 40.1}, 'b': 1090} None + """ + + # a: StructDtype({'a': StructDtype({'b': dtype('int64')}), + # 'b': dtype('float64')}) + # b: ListDtype(ListDtype(int64)) + expected_json_str = ( + '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n' + '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n' + '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n' + '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n' + '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n' + ) + + """ + In [7]: df + Out[7]: + a b + 0 {'a': {'b': 10}, 'b': 11.0} [[10, 1], [12, 23]] + 1 {'a': {'b': 107}, 'b': 5.0} [[10, 11], [12, 0]] + 2 {'a': {'b': 50}, 'b': 2.0} [[10, 11], [12, 2]] + 3 {'a': {'b': 1}, 'b': 67.0} [[6, 7]] + 4 {'a': {'b': 40}, 'b': 1090.0} None + """ + + df = cudf.read_json( + StringIO(actual_json_str), + engine="cudf_experimental", + orient="records", + lines=True, + dtype={ + "a": cudf.StructDtype( + { + "a": cudf.StructDtype({"b": cudf.dtype("int64")}), + "b": cudf.dtype("float64"), + } + ), + "b": cudf.ListDtype(cudf.ListDtype("int64")), + }, + ) + + pdf = pd.read_json( + StringIO(expected_json_str), orient="records", lines=True + ) + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 84d39459a12..9670a5e2d81 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -559,6 +559,11 @@ size in bytes. Set the size to zero to read all data after the offset location. Reads the row that starts before or at the end of the range, even if it ends after the end of the range. +keep_quotes : bool, default False + This parameter is only supported in ``cudf_experimental`` engine. + If `True`, any string values are read literally (and wrapped in an + additional set of quotes). + If `False` string values are parsed into Python strings. Returns ------- @@ -567,6 +572,30 @@ See Also -------- cudf.DataFrame.to_json + +Examples +-------- +>>> import cudf +>>> df = cudf.DataFrame({'a': ["hello", "rapids"], 'b': ["hello", "worlds"]}) +>>> df + a b +0 hello hello +1 rapids worlds +>>> json_str = df.to_json(orient='records', lines=True) +>>> json_str +'{"a":"hello","b":"hello"}\n{"a":"rapids","b":"worlds"}\n' +>>> cudf.read_json(json_str, engine="cudf", lines=True) + a b +0 hello hello +1 rapids worlds + +To read the strings with additional set of quotes: + +>>> cudf.read_json(json_str, engine="cudf_experimental", lines=True, +... keep_quotes=True) + a b +0 "hello" "hello" +1 "rapids" "worlds" """ doc_read_json = docfmt_partial(docstring=_docstring_read_json)