Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Enable schema_element & keep_quotes support in json reader #11746

Merged
merged 18 commits into from
Sep 27, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions python/cudf/cudf/_lib/cpp/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ from cudf._lib.cpp.types cimport data_type, size_type
cdef extern from "cudf/io/json.hpp" \
namespace "cudf::io" nogil:

cdef struct schema_element:
data_type type
map[string, schema_element] child_types

cdef cppclass json_reader_options:
json_reader_options() except+
cudf_io_types.source_info get_source() except+
Expand All @@ -28,7 +32,7 @@ cdef extern from "cudf/io/json.hpp" \

# setter
void set_dtypes(vector[data_type] types) except+
void set_dtypes(map[string, data_type] types) except+
void set_dtypes(map[string, schema_element] types) except+
void set_compression(
cudf_io_types.compression_type compression
) except+
Expand All @@ -37,6 +41,7 @@ cdef extern from "cudf/io/json.hpp" \
void enable_lines(bool val) except+
void enable_dayfirst(bool val) except+
void enable_experimental(bool val) except+
void enable_keep_quotes(bool val) except+

@staticmethod
json_reader_options_builder builder(
Expand All @@ -55,7 +60,7 @@ cdef extern from "cudf/io/json.hpp" \
vector[data_type] types
) except+
json_reader_options_builder& dtypes(
map[string, data_type] types
map[string, schema_element] types
) except+
json_reader_options_builder& compression(
cudf_io_types.compression_type compression
Expand All @@ -75,6 +80,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& experimental(
bool val
) except+
json_reader_options_builder& keep_quotes(
bool val
) except+

json_reader_options build() except+

Expand Down
39 changes: 34 additions & 5 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.cpp.io.json cimport (
json_reader_options,
read_json as libcudf_read_json,
schema_element,
)
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
Expand All @@ -32,7 +33,8 @@ cpdef read_json(object filepaths_or_buffers,
bool lines,
object compression,
object byte_range,
bool experimental):
bool experimental,
bool keep_quotes):
"""
Cython function to call into libcudf API, see `read_json`.

Expand All @@ -55,7 +57,7 @@ cpdef read_json(object filepaths_or_buffers,

# Setup arguments
cdef vector[data_type] c_dtypes_list
cdef map[string, data_type] c_dtypes_map
cdef map[string, schema_element] c_dtypes_schema_map
cdef cudf_io_types.compression_type c_compression
# Determine byte read offsets if applicable
cdef size_type c_range_offset = (
Expand All @@ -81,8 +83,8 @@ cpdef read_json(object filepaths_or_buffers,
elif dtype is not True:
if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
c_dtypes_map[str(k).encode()] = \
_get_cudf_data_type_from_dtype(v)
c_dtypes_schema_map[str(k).encode()] = \
_get_cudf_schema_element_from_dtype(v)
elif isinstance(dtype, abc.Collection):
is_list_like_dtypes = True
c_dtypes_list.reserve(len(dtype))
Expand All @@ -105,8 +107,9 @@ cpdef read_json(object filepaths_or_buffers,
if is_list_like_dtypes:
opts.set_dtypes(c_dtypes_list)
else:
opts.set_dtypes(c_dtypes_map)
opts.set_dtypes(c_dtypes_schema_map)

opts.enable_keep_quotes(keep_quotes)
# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand All @@ -123,6 +126,32 @@ cpdef read_json(object filepaths_or_buffers,

return df


# NOTE: declared ``except *`` rather than ``except +``. ``except +`` only
# applies to extern C++ declarations (it translates C++ exceptions); this is
# a Cython-defined function that raises Python exceptions and returns a
# struct, which leaves no room for an error sentinel value, so callers must
# check for a pending Python error after every call.
cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *:
    """
    Build the libcudf ``schema_element`` describing ``dtype``.

    The element's ``type`` is the libcudf ``data_type`` obtained from
    ``dtype``. For a struct dtype every field is converted recursively and
    stored in ``child_types`` under its (encoded) field name. For a list
    dtype two children are stored: ``"offsets"`` (int32) and ``"element"``
    (the list's element type, converted recursively).

    Raises
    ------
    NotImplementedError
        If ``dtype`` is a categorical dtype.
    """
    cdef schema_element s_element
    cdef data_type lib_type
    if cudf.api.types.is_categorical_dtype(dtype):
        raise NotImplementedError(
            "CategoricalDtype as dtype is not yet "
            "supported in JSON reader"
        )

    # Normalize to a cudf dtype object before inspecting nested structure.
    dtype = cudf.dtype(dtype)
    lib_type = dtype_to_data_type(dtype)
    s_element.type = lib_type
    if isinstance(dtype, cudf.StructDtype):
        for name, child_type in dtype.fields.items():
            s_element.child_types[name.encode()] = \
                _get_cudf_schema_element_from_dtype(child_type)
    elif isinstance(dtype, cudf.ListDtype):
        s_element.child_types["offsets".encode()] = \
            _get_cudf_schema_element_from_dtype(cudf.dtype("int32"))
        s_element.child_types["element".encode()] = \
            _get_cudf_schema_element_from_dtype(dtype.element_type)

    return s_element


cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +:
if cudf.api.types.is_categorical_dtype(dtype):
raise NotImplementedError(
Expand Down
19 changes: 17 additions & 2 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,27 @@ def read_json(
lines=False,
compression="infer",
byte_range=None,
keep_quotes=False,
*args,
**kwargs,
):
"""{docstring}"""

if engine == "cudf" and not lines:
raise ValueError("cudf engine only supports JSON Lines format")
if not isinstance(dtype, (abc.Mapping, bool)):
warnings.warn(
"passing 'dtype' as list is deprecated, instead pass "
"a dict of column name and types key-value paris."
"in future versions 'dtype' can only be a dict or bool",
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
FutureWarning,
)

if engine in {"cudf", "cudf_experimental"} and not lines:
Copy link
Contributor

@karthikeyann karthikeyann Sep 26, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With PR #11714, cudf_experimental will support both the JSON Lines and records formats.
Also, the old cudf_experimental engine already supports the records format.

Suggested change
if engine in {"cudf", "cudf_experimental"} and not lines:
if engine == "cudf" and not lines:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@karthikeyann, is #11714 ready to merge for 22.10?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If tests pass, it should be. I addressed most of the review comments and left some for follow-up PRs.
I also merged this PR locally and tested it with the above change; the tests pass.
The experimental engine could use more Python tests.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" valid JSON Lines?
It is not supported by the cudf_experimental engine; only the records orient (with lines=True or lines=False) is supported.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" a valid JSON lines?

It is:

In [1]: import pandas as pd

In [2]: json = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n"

In [5]: pd.read_json(json, lines=True, orient="records")
Out[5]: 
   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9

raise ValueError(f"{engine} engine only supports JSON Lines format")
if engine != "cudf_experimental" and keep_quotes:
raise ValueError(
"keep_quotes='True' is supported only with"
" engine='cudf_experimental'"
)
wence- marked this conversation as resolved.
Show resolved Hide resolved
if engine == "auto":
engine = "cudf" if lines else "pandas"
if engine == "cudf" or engine == "cudf_experimental":
Expand Down Expand Up @@ -64,6 +78,7 @@ def read_json(
compression,
byte_range,
engine == "cudf_experimental",
keep_quotes,
)
else:
warnings.warn(
Expand Down
63 changes: 56 additions & 7 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,10 @@ def test_json_lines_byte_range(json_input):
assert df.shape == (1, 3)


@pytest.mark.parametrize(
"dtype", [["float", "int", "short"], {1: "int", 2: "short", 0: "float"}]
)
def test_json_lines_dtypes(json_input, dtype):
df = cudf.read_json(json_input, lines=True, dtype=dtype)
def test_json_lines_dtypes(json_input):
df = cudf.read_json(
json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"}
)
assert all(df.dtypes == ["float64", "int64", "int16"])


Expand All @@ -302,7 +301,10 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
pd_df.to_json(fname, compression=out_comp, lines=True, orient="records")

cu_df = cudf.read_json(
str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"]
str(fname),
compression=in_comp,
lines=True,
dtype={"col1": "int32", "col2": "int32"},
)
assert_eq(pd_df, cu_df)

Expand Down Expand Up @@ -345,7 +347,9 @@ def test_json_bool_values():
# boolean values should be converted to 0/1
np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy())

cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"])
cu_df = cudf.read_json(
buffer, lines=True, dtype={"0": "bool", "1": "long"}
)
Comment on lines +350 to +352
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the dtype argument going to be a breaking change for the engine='cudf' reader?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, dtype is going to continue to work for engine='cudf' reader. But the reason I've deprecated supporting the list of dtypes in dtype param is there is no way we can give the column a name with schema_info being introduced going forward.

Copy link
Contributor Author

@galipremsagar galipremsagar Sep 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dtype=list is going to be a breaking change once we drop it, but this PR is just deprecating it. Pandas doesn't have it, and hence we won't need it either.

np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)


Expand Down Expand Up @@ -660,3 +664,48 @@ def test_json_types_data():
pdf, schema=df.to_arrow().schema, safe=False
)
assert df.to_arrow().equals(pa_table_pdf)


galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize(
    "keep_quotes,result",
    [
        (
            True,
            {
                "c1": [
                    {"f1": '"sf11"', "f2": '"sf21"'},
                    {"f1": '"sf12"', "f2": '"sf22"'},
                ],
                "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']],
            },
        ),
        (
            False,
            {
                "c1": [
                    {"f1": "sf11", "f2": "sf21"},
                    {"f1": "sf12", "f2": "sf22"},
                ],
                "c2": [["l11", "l21"], ["l12", "l22"]],
            },
        ),
    ],
)
def test_json_keep_quotes(keep_quotes, result):
    """Check ``keep_quotes`` handling for nested string columns.

    A frame with one struct column and one list column of strings is
    serialized to records-oriented JSON with pandas, read back through the
    ``cudf_experimental`` engine, and compared against ``result``.
    """
    source_frame = pd.DataFrame(
        {
            "c1": [
                {"f1": "sf11", "f2": "sf21"},
                {"f1": "sf12", "f2": "sf22"},
            ],
            "c2": [["l11", "l21"], ["l12", "l22"]],
        }
    )
    buffer = BytesIO()
    source_frame.to_json(buffer, orient="records")

    got = cudf.read_json(
        buffer,
        engine="cudf_experimental",
        orient="records",
        keep_quotes=keep_quotes,
    )

    assert_eq(got, pd.DataFrame(result))
29 changes: 29 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,11 @@
size in bytes. Set the size to zero to read all data after the offset
location. Reads the row that starts before or at the end of the range,
even if it ends after the end of the range.
keep_quotes : bool, default False
This parameter is only supported in ``cudf_experimental`` engine.
If `True`, any string values are read literally (and wrapped in an
additional set of quotes).
If `False`, string values are parsed into Python strings.

Returns
-------
Expand All @@ -567,6 +572,30 @@
See Also
--------
cudf.DataFrame.to_json

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': ["hello", "rapids"], 'b': ["hello", "worlds"]})
>>> df
a b
0 hello hello
1 rapids worlds
>>> json_str = df.to_json(orient='records', lines=True)
>>> json_str
'{"a":"hello","b":"hello"}\n{"a":"rapids","b":"worlds"}\n'
>>> cudf.read_json(json_str, engine="cudf", lines=True)
a b
0 hello hello
1 rapids worlds

To read the strings with an additional set of quotes:

>>> cudf.read_json(json_str, engine="cudf_experimental", lines=True,
... keep_quotes=True)
a b
0 "hello" "hello"
1 "rapids" "worlds"
"""
doc_read_json = docfmt_partial(docstring=_docstring_read_json)

Expand Down