Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement on_bad_lines in json reader #15834

Merged
merged 4 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport (
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_reader_options,
json_recovery_mode_t,
json_writer_options,
read_json as libcudf_read_json,
schema_element,
Expand All @@ -42,14 +43,24 @@ from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table


cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
    """
    Translate the user-facing ``on_bad_lines`` option into the libcudf
    ``json_recovery_mode_t`` enum.

    Parameters
    ----------
    on_bad_lines : str
        Either ``"error"`` or ``"recover"``; matched case-insensitively.

    Returns
    -------
    json_recovery_mode_t
        ``FAIL`` for ``"error"`` and ``RECOVER_WITH_NULL`` for ``"recover"``.

    Raises
    ------
    TypeError
        If ``on_bad_lines`` is not a string, or is a string other than the
        two supported values.
    """
    # A non-string argument (e.g. None) has no ``.lower()``. Reject it with
    # the same TypeError as any other invalid value rather than leaking an
    # AttributeError to the caller.
    if isinstance(on_bad_lines, str):
        mode = on_bad_lines.lower()
        if mode == "error":
            return json_recovery_mode_t.FAIL
        if mode == "recover":
            return json_recovery_mode_t.RECOVER_WITH_NULL
    raise TypeError(f"Invalid parameter for {on_bad_lines=}")


cpdef read_json(object filepaths_or_buffers,
object dtype,
bool lines,
object compression,
object byte_range,
bool keep_quotes,
bool mixed_types_as_string,
bool prune_columns):
bool prune_columns,
object on_bad_lines):
"""
Cython function to call into libcudf API, see `read_json`.

Expand Down Expand Up @@ -118,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,
.lines(c_lines)
.byte_range_offset(c_range_offset)
.byte_range_size(c_range_size)
.recovery_mode(_get_json_recovery_mode(on_bad_lines))
.build()
)
if is_list_like_dtypes:
Expand All @@ -128,6 +140,7 @@ cpdef read_json(object filepaths_or_buffers,
opts.enable_keep_quotes(keep_quotes)
opts.enable_mixed_types_as_string(mixed_types_as_string)
opts.enable_prune_columns(prune_columns)

# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \
data_type type
map[string, schema_element] child_types

# Cython declaration of the C++ enum cudf::io::json_recovery_mode_t from
# "cudf/io/json.hpp". The quoted strings are the C++ names emitted for
# each member.
cdef enum json_recovery_mode_t:
# Selected when the Python-level on_bad_lines option is "error".
FAIL "cudf::io::json_recovery_mode_t::FAIL"
# Selected when the Python-level on_bad_lines option is "recover".
RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL"

cdef cppclass json_reader_options:
json_reader_options() except +
cudf_io_types.source_info get_source() except +
Expand Down Expand Up @@ -90,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& keep_quotes(
bool val
) except +
json_reader_options_builder& recovery_mode(
json_recovery_mode_t val
) except +

json_reader_options build() except +

Expand Down
18 changes: 10 additions & 8 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def read_json(
storage_options=None,
mixed_types_as_string=False,
prune_columns=False,
on_bad_lines="error",
*args,
**kwargs,
):
Expand Down Expand Up @@ -94,14 +95,15 @@ def read_json(
filepaths_or_buffers.append(tmp_source)

df = libjson.read_json(
filepaths_or_buffers,
dtype,
lines,
compression,
byte_range,
keep_quotes,
mixed_types_as_string,
prune_columns,
filepaths_or_buffers=filepaths_or_buffers,
dtype=dtype,
lines=lines,
compression=compression,
byte_range=byte_range,
keep_quotes=keep_quotes,
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
on_bad_lines=on_bad_lines,
)
else:
warnings.warn(
Expand Down
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string):
orient="records",
lines=True,
)


@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"])
def test_json_reader_on_bad_lines(on_bad_lines):
    """Check how ``cudf.read_json`` reacts to a malformed JSON line.

    The input holds three valid records and one invalid line (``abc``):

    - ``"error"`` is expected to raise ``RuntimeError``.
    - ``"recover"`` is expected to yield a null row in place of the bad line.
    - any other value (here ``"abc"``) is expected to raise ``TypeError``.
    """
    json_input = StringIO(
        '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
    )
    read_kwargs = {
        "lines": True,
        "orient": "records",
        "on_bad_lines": on_bad_lines,
    }

    # Only the recover mode produces a frame to compare against.
    if on_bad_lines == "recover":
        actual = cudf.read_json(json_input, **read_kwargs)
        expected = cudf.DataFrame(
            {"a": [1, 2, None, 3], "b": [10, 11, None, 12]}
        )
        assert_eq(actual, expected)
        return

    # Both remaining modes must fail; only the exception type differs.
    expected_error = RuntimeError if on_bad_lines == "error" else TypeError
    with pytest.raises(expected_error):
        cudf.read_json(json_input, **read_kwargs)
5 changes: 5 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,11 @@

If True, only return those columns mentioned in the dtype argument.
If `False`, the dtype argument is used as a type inference suggestion.
on_bad_lines : {'error', 'recover'}, default 'error'
Specifies what to do upon encountering a bad line. Allowed values are:

- ``'error'``, raise an exception when a bad line is encountered.
- ``'recover'``, fill the row with ``<NA>`` when a bad line is encountered.

Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
Expand Down
Loading