Skip to content

Commit

Permalink
Enable schema_element & keep_quotes support in json reader (#11746)
Browse files Browse the repository at this point in the history
This PR plumbs `schema_element` and `keep_quotes` support in json reader.

**Deprecation:** This PR also contains changes deprecating `dtype` as `list` inputs. This appears to be a very outdated legacy feature that we have continued to support, and it cannot be supported together with `schema_element`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #11746
  • Loading branch information
galipremsagar authored Sep 27, 2022
1 parent 831ef04 commit 35b0a52
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 15 deletions.
12 changes: 10 additions & 2 deletions python/cudf/cudf/_lib/cpp/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ from cudf._lib.cpp.types cimport data_type, size_type
cdef extern from "cudf/io/json.hpp" \
namespace "cudf::io" nogil:

cdef struct schema_element:
data_type type
map[string, schema_element] child_types

cdef cppclass json_reader_options:
json_reader_options() except+
cudf_io_types.source_info get_source() except+
Expand All @@ -28,7 +32,7 @@ cdef extern from "cudf/io/json.hpp" \

# setter
void set_dtypes(vector[data_type] types) except+
void set_dtypes(map[string, data_type] types) except+
void set_dtypes(map[string, schema_element] types) except+
void set_compression(
cudf_io_types.compression_type compression
) except+
Expand All @@ -37,6 +41,7 @@ cdef extern from "cudf/io/json.hpp" \
void enable_lines(bool val) except+
void enable_dayfirst(bool val) except+
void enable_experimental(bool val) except+
void enable_keep_quotes(bool val) except+

@staticmethod
json_reader_options_builder builder(
Expand All @@ -55,7 +60,7 @@ cdef extern from "cudf/io/json.hpp" \
vector[data_type] types
) except+
json_reader_options_builder& dtypes(
map[string, data_type] types
map[string, schema_element] types
) except+
json_reader_options_builder& compression(
cudf_io_types.compression_type compression
Expand All @@ -75,6 +80,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& experimental(
bool val
) except+
json_reader_options_builder& keep_quotes(
bool val
) except+

json_reader_options build() except+

Expand Down
39 changes: 34 additions & 5 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.cpp.io.json cimport (
json_reader_options,
read_json as libcudf_read_json,
schema_element,
)
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
Expand All @@ -32,7 +33,8 @@ cpdef read_json(object filepaths_or_buffers,
bool lines,
object compression,
object byte_range,
bool experimental):
bool experimental,
bool keep_quotes):
"""
Cython function to call into libcudf API, see `read_json`.
Expand All @@ -55,7 +57,7 @@ cpdef read_json(object filepaths_or_buffers,

# Setup arguments
cdef vector[data_type] c_dtypes_list
cdef map[string, data_type] c_dtypes_map
cdef map[string, schema_element] c_dtypes_schema_map
cdef cudf_io_types.compression_type c_compression
# Determine byte read offsets if applicable
cdef size_type c_range_offset = (
Expand All @@ -81,8 +83,8 @@ cpdef read_json(object filepaths_or_buffers,
elif dtype is not True:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
c_dtypes_map[str(k).encode()] = \
_get_cudf_data_type_from_dtype(v)
c_dtypes_schema_map[str(k).encode()] = \
_get_cudf_schema_element_from_dtype(v)
elif isinstance(dtype, abc.Collection):
is_list_like_dtypes = True
c_dtypes_list.reserve(len(dtype))
Expand All @@ -105,8 +107,9 @@ cpdef read_json(object filepaths_or_buffers,
if is_list_like_dtypes:
opts.set_dtypes(c_dtypes_list)
else:
opts.set_dtypes(c_dtypes_map)
opts.set_dtypes(c_dtypes_schema_map)

opts.enable_keep_quotes(keep_quotes)
# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand All @@ -123,6 +126,32 @@ cpdef read_json(object filepaths_or_buffers,

return df


cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +:
    """Translate a cudf/numpy dtype into a libcudf ``schema_element``.

    Recurses into struct fields and list children so that nested dtypes
    are fully described for the JSON reader.
    """
    cdef schema_element element
    cdef data_type lib_type

    if cudf.api.types.is_categorical_dtype(dtype):
        raise NotImplementedError(
            "CategoricalDtype as dtype is not yet "
            "supported in JSON reader"
        )

    dtype = cudf.dtype(dtype)
    lib_type = dtype_to_data_type(dtype)
    element.type = lib_type

    if isinstance(dtype, cudf.StructDtype):
        # One child entry per struct field, keyed by the field name.
        for field_name, field_dtype in dtype.fields.items():
            element.child_types[field_name.encode()] = \
                _get_cudf_schema_element_from_dtype(field_dtype)
    elif isinstance(dtype, cudf.ListDtype):
        # A list column is described by an int32 offsets child plus the
        # element child, mirroring libcudf's list column layout.
        element.child_types[b"offsets"] = \
            _get_cudf_schema_element_from_dtype(cudf.dtype("int32"))
        element.child_types[b"element"] = \
            _get_cudf_schema_element_from_dtype(dtype.element_type)

    return element


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
if cudf.api.types.is_categorical_dtype(dtype):
raise NotImplementedError(
Expand Down
17 changes: 16 additions & 1 deletion python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,27 @@ def read_json(
lines=False,
compression="infer",
byte_range=None,
keep_quotes=False,
*args,
**kwargs,
):
"""{docstring}"""

if not isinstance(dtype, (abc.Mapping, bool)):
warnings.warn(
"passing 'dtype' as list is deprecated, instead pass "
"a dict of column name and types key-value paris."
"in future versions 'dtype' can only be a dict or bool",
FutureWarning,
)

if engine == "cudf" and not lines:
raise ValueError("cudf engine only supports JSON Lines format")
raise ValueError(f"{engine} engine only supports JSON Lines format")
if engine != "cudf_experimental" and keep_quotes:
raise ValueError(
"keep_quotes='True' is supported only with"
" engine='cudf_experimental'"
)
if engine == "auto":
engine = "cudf" if lines else "pandas"
if engine == "cudf" or engine == "cudf_experimental":
Expand Down Expand Up @@ -64,6 +78,7 @@ def read_json(
compression,
byte_range,
engine == "cudf_experimental",
keep_quotes,
)
else:
warnings.warn(
Expand Down
135 changes: 128 additions & 7 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,10 @@ def test_json_lines_byte_range(json_input):
assert df.shape == (1, 3)


@pytest.mark.parametrize(
"dtype", [["float", "int", "short"], {1: "int", 2: "short", 0: "float"}]
)
def test_json_lines_dtypes(json_input, dtype):
df = cudf.read_json(json_input, lines=True, dtype=dtype)
def test_json_lines_dtypes(json_input):
    # Keys are deliberately given out of order: the reader must match
    # dtypes to columns by key, not by dict insertion order.
    result = cudf.read_json(
        json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"}
    )
    expected_dtypes = ["float64", "int64", "int16"]
    assert all(result.dtypes == expected_dtypes)


Expand All @@ -302,7 +301,10 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
pd_df.to_json(fname, compression=out_comp, lines=True, orient="records")

cu_df = cudf.read_json(
str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"]
str(fname),
compression=in_comp,
lines=True,
dtype={"col1": "int32", "col2": "int32"},
)
assert_eq(pd_df, cu_df)

Expand Down Expand Up @@ -345,7 +347,9 @@ def test_json_bool_values():
# boolean values should be converted to 0/1
np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy())

cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"])
cu_df = cudf.read_json(
buffer, lines=True, dtype={"0": "bool", "1": "long"}
)
np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)


Expand Down Expand Up @@ -663,3 +667,120 @@ def test_json_types_data():
pdf, schema=df.to_arrow().schema, safe=False
)
assert df.to_arrow().equals(pa_table_pdf)


@pytest.mark.parametrize(
    "keep_quotes,result",
    [
        (
            True,
            {
                "c1": [
                    {"f1": '"sf11"', "f2": '"sf21"'},
                    {"f1": '"sf12"', "f2": '"sf22"'},
                ],
                "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']],
            },
        ),
        (
            False,
            {
                "c1": [
                    {"f1": "sf11", "f2": "sf21"},
                    {"f1": "sf12", "f2": "sf22"},
                ],
                "c2": [["l11", "l21"], ["l12", "l22"]],
            },
        ),
    ],
)
def test_json_keep_quotes(keep_quotes, result):
    # Round-trip nested struct/list columns through JSON Lines and check
    # that ``keep_quotes`` controls whether string quotes are preserved.
    data = {
        "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
        "c2": [["l11", "l21"], ["l12", "l22"]],
    }
    buffer = BytesIO()
    pd.DataFrame(data).to_json(buffer, orient="records", lines=True)

    actual = cudf.read_json(
        buffer,
        engine="cudf_experimental",
        orient="records",
        lines=True,
        keep_quotes=keep_quotes,
    )

    assert_eq(actual, pd.DataFrame(result))


def test_json_dtypes_nested_data():
    # Input column types as written in the JSON:
    #   a: StructDtype({'a': StructDtype({'b': float64}), 'b': int64})
    #   b: ListDtype(ListDtype(float64))
    input_json = (
        '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n'
        '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n'
        '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n'
        '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n'
        '{"a":{"a":{"b":40.1},"b":1090},"b":null}\n'
    )

    # The requested (different) nested dtypes, which the reader must
    # cast the parsed values to:
    #   a: StructDtype({'a': StructDtype({'b': int64}), 'b': float64})
    #   b: ListDtype(ListDtype(int64))
    # ``expected_json`` is the same data already expressed in those types.
    expected_json = (
        '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n'
        '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n'
        '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n'
        '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n'
        '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n'
    )

    gdf = cudf.read_json(
        StringIO(input_json),
        engine="cudf_experimental",
        orient="records",
        lines=True,
        dtype={
            "a": cudf.StructDtype(
                {
                    "a": cudf.StructDtype({"b": cudf.dtype("int64")}),
                    "b": cudf.dtype("float64"),
                }
            ),
            "b": cudf.ListDtype(cudf.ListDtype("int64")),
        },
    )

    expected_pdf = pd.read_json(
        StringIO(expected_json), orient="records", lines=True
    )
    expected_pdf.columns = expected_pdf.columns.astype("str")
    expected_table = pa.Table.from_pandas(
        expected_pdf, schema=gdf.to_arrow().schema, safe=False
    )
    assert gdf.to_arrow().equals(expected_table)
29 changes: 29 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,11 @@
size in bytes. Set the size to zero to read all data after the offset
location. Reads the row that starts before or at the end of the range,
even if it ends after the end of the range.
keep_quotes : bool, default False
This parameter is only supported in ``cudf_experimental`` engine.
If `True`, any string values are read literally (and wrapped in an
additional set of quotes).
If `False` string values are parsed into Python strings.
Returns
-------
Expand All @@ -567,6 +572,30 @@
See Also
--------
cudf.DataFrame.to_json
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': ["hello", "rapids"], 'b': ["hello", "worlds"]})
>>> df
a b
0 hello hello
1 rapids worlds
>>> json_str = df.to_json(orient='records', lines=True)
>>> json_str
'{"a":"hello","b":"hello"}\n{"a":"rapids","b":"worlds"}\n'
>>> cudf.read_json(json_str, engine="cudf", lines=True)
a b
0 hello hello
1 rapids worlds
To read the strings with additional set of quotes:
>>> cudf.read_json(json_str, engine="cudf_experimental", lines=True,
... keep_quotes=True)
a b
0 "hello" "hello"
1 "rapids" "worlds"
"""
doc_read_json = docfmt_partial(docstring=_docstring_read_json)

Expand Down

0 comments on commit 35b0a52

Please sign in to comment.