Skip to content

Commit

Permalink
Enable schema_element & keep_quotes support in json reader (#11746)
Browse files Browse the repository at this point in the history
This PR plumbs `schema_element` and `keep_quotes` support in json reader.

**Deprecation:** This PR also contains changes deprecating `dtype` as `list` inputs. This appears to be a very outdated legacy feature that we have continued to support, and it cannot be supported together with `schema_element`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #11746
  • Loading branch information
galipremsagar authored Sep 27, 2022
1 parent 831ef04 commit 35b0a52
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 15 deletions.
12 changes: 10 additions & 2 deletions python/cudf/cudf/_lib/cpp/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ from cudf._lib.cpp.types cimport data_type, size_type
cdef extern from "cudf/io/json.hpp" \
namespace "cudf::io" nogil:

cdef struct schema_element:
data_type type
map[string, schema_element] child_types

cdef cppclass json_reader_options:
json_reader_options() except+
cudf_io_types.source_info get_source() except+
Expand All @@ -28,7 +32,7 @@ cdef extern from "cudf/io/json.hpp" \

# setter
void set_dtypes(vector[data_type] types) except+
void set_dtypes(map[string, data_type] types) except+
void set_dtypes(map[string, schema_element] types) except+
void set_compression(
cudf_io_types.compression_type compression
) except+
Expand All @@ -37,6 +41,7 @@ cdef extern from "cudf/io/json.hpp" \
void enable_lines(bool val) except+
void enable_dayfirst(bool val) except+
void enable_experimental(bool val) except+
void enable_keep_quotes(bool val) except+

@staticmethod
json_reader_options_builder builder(
Expand All @@ -55,7 +60,7 @@ cdef extern from "cudf/io/json.hpp" \
vector[data_type] types
) except+
json_reader_options_builder& dtypes(
map[string, data_type] types
map[string, schema_element] types
) except+
json_reader_options_builder& compression(
cudf_io_types.compression_type compression
Expand All @@ -75,6 +80,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& experimental(
bool val
) except+
json_reader_options_builder& keep_quotes(
bool val
) except+

json_reader_options build() except+

Expand Down
39 changes: 34 additions & 5 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.cpp.io.json cimport (
json_reader_options,
read_json as libcudf_read_json,
schema_element,
)
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
Expand All @@ -32,7 +33,8 @@ cpdef read_json(object filepaths_or_buffers,
bool lines,
object compression,
object byte_range,
bool experimental):
bool experimental,
bool keep_quotes):
"""
Cython function to call into libcudf API, see `read_json`.
Expand All @@ -55,7 +57,7 @@ cpdef read_json(object filepaths_or_buffers,

# Setup arguments
cdef vector[data_type] c_dtypes_list
cdef map[string, data_type] c_dtypes_map
cdef map[string, schema_element] c_dtypes_schema_map
cdef cudf_io_types.compression_type c_compression
# Determine byte read offsets if applicable
cdef size_type c_range_offset = (
Expand All @@ -81,8 +83,8 @@ cpdef read_json(object filepaths_or_buffers,
elif dtype is not True:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
c_dtypes_map[str(k).encode()] = \
_get_cudf_data_type_from_dtype(v)
c_dtypes_schema_map[str(k).encode()] = \
_get_cudf_schema_element_from_dtype(v)
elif isinstance(dtype, abc.Collection):
is_list_like_dtypes = True
c_dtypes_list.reserve(len(dtype))
Expand All @@ -105,8 +107,9 @@ cpdef read_json(object filepaths_or_buffers,
if is_list_like_dtypes:
opts.set_dtypes(c_dtypes_list)
else:
opts.set_dtypes(c_dtypes_map)
opts.set_dtypes(c_dtypes_schema_map)

opts.enable_keep_quotes(keep_quotes)
# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand All @@ -123,6 +126,32 @@ cpdef read_json(object filepaths_or_buffers,

return df


cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
    """Translate a cudf/numpy dtype into a libcudf ``schema_element``.

    Recurses into struct fields and list children so that nested dtypes
    are fully described for the JSON reader.
    """
    cdef schema_element element
    cdef data_type lib_type

    if cudf.api.types.is_categorical_dtype(dtype):
        raise NotImplementedError(
            "CategoricalDtype as dtype is not yet "
            "supported in JSON reader"
        )

    dtype = cudf.dtype(dtype)
    lib_type = dtype_to_data_type(dtype)
    element.type = lib_type

    if isinstance(dtype, cudf.StructDtype):
        # One child entry per struct field, keyed by the field name.
        for field_name, field_dtype in dtype.fields.items():
            element.child_types[field_name.encode()] = \
                _get_cudf_schema_element_from_dtype(field_dtype)
    elif isinstance(dtype, cudf.ListDtype):
        # A list column is described by an int32 offsets child plus the
        # element child, mirroring libcudf's list column layout.
        element.child_types[b"offsets"] = \
            _get_cudf_schema_element_from_dtype(cudf.dtype("int32"))
        element.child_types[b"element"] = \
            _get_cudf_schema_element_from_dtype(dtype.element_type)

    return element


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
if cudf.api.types.is_categorical_dtype(dtype):
raise NotImplementedError(
Expand Down
17 changes: 16 additions & 1 deletion python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,27 @@ def read_json(
lines=False,
compression="infer",
byte_range=None,
keep_quotes=False,
*args,
**kwargs,
):
"""{docstring}"""

if not isinstance(dtype, (abc.Mapping, bool)):
warnings.warn(
"passing 'dtype' as list is deprecated, instead pass "
"a dict of column name and types key-value paris."
"in future versions 'dtype' can only be a dict or bool",
FutureWarning,
)

if engine == "cudf" and not lines:
raise ValueError("cudf engine only supports JSON Lines format")
raise ValueError(f"{engine} engine only supports JSON Lines format")
if engine != "cudf_experimental" and keep_quotes:
raise ValueError(
"keep_quotes='True' is supported only with"
" engine='cudf_experimental'"
)
if engine == "auto":
engine = "cudf" if lines else "pandas"
if engine == "cudf" or engine == "cudf_experimental":
Expand Down Expand Up @@ -64,6 +78,7 @@ def read_json(
compression,
byte_range,
engine == "cudf_experimental",
keep_quotes,
)
else:
warnings.warn(
Expand Down
135 changes: 128 additions & 7 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,10 @@ def test_json_lines_byte_range(json_input):
assert df.shape == (1, 3)


@pytest.mark.parametrize(
"dtype", [["float", "int", "short"], {1: "int", 2: "short", 0: "float"}]
)
def test_json_lines_dtypes(json_input, dtype):
df = cudf.read_json(json_input, lines=True, dtype=dtype)
def test_json_lines_dtypes(json_input):
    # Keys are deliberately given out of order: the reader must match
    # dtypes to columns by key, not by dict insertion order.
    result = cudf.read_json(
        json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"}
    )
    expected_dtypes = ["float64", "int64", "int16"]
    assert all(result.dtypes == expected_dtypes)


Expand All @@ -302,7 +301,10 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
pd_df.to_json(fname, compression=out_comp, lines=True, orient="records")

cu_df = cudf.read_json(
str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"]
str(fname),
compression=in_comp,
lines=True,
dtype={"col1": "int32", "col2": "int32"},
)
assert_eq(pd_df, cu_df)

Expand Down Expand Up @@ -345,7 +347,9 @@ def test_json_bool_values():
# boolean values should be converted to 0/1
np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy())

cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"])
cu_df = cudf.read_json(
buffer, lines=True, dtype={"0": "bool", "1": "long"}
)
np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)


Expand Down Expand Up @@ -663,3 +667,120 @@ def test_json_types_data():
pdf, schema=df.to_arrow().schema, safe=False
)
assert df.to_arrow().equals(pa_table_pdf)


@pytest.mark.parametrize(
    "keep_quotes,result",
    [
        (
            True,
            {
                "c1": [
                    {"f1": '"sf11"', "f2": '"sf21"'},
                    {"f1": '"sf12"', "f2": '"sf22"'},
                ],
                "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']],
            },
        ),
        (
            False,
            {
                "c1": [
                    {"f1": "sf11", "f2": "sf21"},
                    {"f1": "sf12", "f2": "sf22"},
                ],
                "c2": [["l11", "l21"], ["l12", "l22"]],
            },
        ),
    ],
)
def test_json_keep_quotes(keep_quotes, result):
    # Round-trip nested struct/list columns through JSON Lines and check
    # that ``keep_quotes`` controls whether string quotes are preserved.
    data = {
        "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
        "c2": [["l11", "l21"], ["l12", "l22"]],
    }
    buffer = BytesIO()
    pd.DataFrame(data).to_json(buffer, orient="records", lines=True)

    actual = cudf.read_json(
        buffer,
        engine="cudf_experimental",
        orient="records",
        lines=True,
        keep_quotes=keep_quotes,
    )

    assert_eq(actual, pd.DataFrame(result))


def test_json_dtypes_nested_data():
    # Input column types as written in the JSON:
    #   a: StructDtype({'a': StructDtype({'b': float64}), 'b': int64})
    #   b: ListDtype(ListDtype(float64))
    input_json = (
        '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n'
        '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n'
        '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n'
        '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n'
        '{"a":{"a":{"b":40.1},"b":1090},"b":null}\n'
    )

    # The requested (different) nested dtypes, which the reader must
    # cast the parsed values to:
    #   a: StructDtype({'a': StructDtype({'b': int64}), 'b': float64})
    #   b: ListDtype(ListDtype(int64))
    # ``expected_json`` is the same data already expressed in those types.
    expected_json = (
        '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n'
        '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n'
        '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n'
        '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n'
        '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n'
    )

    gdf = cudf.read_json(
        StringIO(input_json),
        engine="cudf_experimental",
        orient="records",
        lines=True,
        dtype={
            "a": cudf.StructDtype(
                {
                    "a": cudf.StructDtype({"b": cudf.dtype("int64")}),
                    "b": cudf.dtype("float64"),
                }
            ),
            "b": cudf.ListDtype(cudf.ListDtype("int64")),
        },
    )

    expected_pdf = pd.read_json(
        StringIO(expected_json), orient="records", lines=True
    )
    expected_pdf.columns = expected_pdf.columns.astype("str")
    expected_table = pa.Table.from_pandas(
        expected_pdf, schema=gdf.to_arrow().schema, safe=False
    )
    assert gdf.to_arrow().equals(expected_table)
29 changes: 29 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,11 @@
size in bytes. Set the size to zero to read all data after the offset
location. Reads the row that starts before or at the end of the range,
even if it ends after the end of the range.
keep_quotes : bool, default False
This parameter is only supported in ``cudf_experimental`` engine.
If `True`, any string values are read literally (and wrapped in an
additional set of quotes).
If `False` string values are parsed into Python strings.
Returns
-------
Expand All @@ -567,6 +572,30 @@
See Also
--------
cudf.DataFrame.to_json
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': ["hello", "rapids"], 'b': ["hello", "worlds"]})
>>> df
a b
0 hello hello
1 rapids worlds
>>> json_str = df.to_json(orient='records', lines=True)
>>> json_str
'{"a":"hello","b":"hello"}\n{"a":"rapids","b":"worlds"}\n'
>>> cudf.read_json(json_str, engine="cudf", lines=True)
a b
0 hello hello
1 rapids worlds
To read the strings with additional set of quotes:
>>> cudf.read_json(json_str, engine="cudf_experimental", lines=True,
... keep_quotes=True)
a b
0 "hello" "hello"
1 "rapids" "worlds"
"""
doc_read_json = docfmt_partial(docstring=_docstring_read_json)

Expand Down

0 comments on commit 35b0a52

Please sign in to comment.