Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement on_bad_lines in json reader #15834

Merged
merged 4 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport (
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.json cimport (
json_reader_options,
json_recovery_mode_t,
json_writer_options,
read_json as libcudf_read_json,
schema_element,
Expand All @@ -42,6 +43,15 @@ from cudf._lib.types cimport dtype_to_data_type
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table


def _get_json_recovery_mode(object on_bad_lines):
    """
    Translate the ``on_bad_lines`` option into a libcudf recovery mode.

    Parameters
    ----------
    on_bad_lines : str
        Matched case-insensitively. ``"error"`` selects
        ``json_recovery_mode_t.FAIL`` and ``"recover"`` selects
        ``json_recovery_mode_t.RECOVER_WITH_NULL``.

    Returns
    -------
    json_recovery_mode_t
        The recovery mode corresponding to ``on_bad_lines``.

    Raises
    ------
    TypeError
        If ``on_bad_lines`` is not a string, or is a string other than
        the two supported values.
    """
    # A non-string value has no ``.lower()``; reject it with the same
    # TypeError used for any other unsupported value rather than letting
    # an unrelated AttributeError escape to the caller.
    if not isinstance(on_bad_lines, str):
        raise TypeError(f"Invalid parameter for {on_bad_lines=}")

    # Normalize once instead of lowering the string for every comparison.
    mode = on_bad_lines.lower()
    if mode == "error":
        return json_recovery_mode_t.FAIL
    if mode == "recover":
        return json_recovery_mode_t.RECOVER_WITH_NULL
    raise TypeError(f"Invalid parameter for {on_bad_lines=}")


cpdef read_json(object filepaths_or_buffers,
object dtype,
bool lines,
Expand All @@ -50,7 +60,8 @@ cpdef read_json(object filepaths_or_buffers,
bool legacy,
bool keep_quotes,
bool mixed_types_as_string,
bool prune_columns):
bool prune_columns,
object on_bad_lines):
"""
Cython function to call into libcudf API, see `read_json`.

Expand Down Expand Up @@ -130,6 +141,8 @@ cpdef read_json(object filepaths_or_buffers,
opts.enable_keep_quotes(keep_quotes)
opts.enable_mixed_types_as_string(mixed_types_as_string)
opts.enable_prune_columns(prune_columns)
opts.set_recovery_mode(_get_json_recovery_mode(on_bad_lines))

# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \
data_type type
map[string, schema_element] child_types

# Recovery modes passed to json_reader_options.set_recovery_mode().
# Each member is bound to its C++ enumerator through the quoted C name.
cdef enum json_recovery_mode_t:
    # Selected when on_bad_lines is "error".
    FAIL "cudf::io::json_recovery_mode_t::FAIL"
    # Selected when on_bad_lines is "recover".
    RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL"

cdef cppclass json_reader_options:
json_reader_options() except +
cudf_io_types.source_info get_source() except +
Expand Down Expand Up @@ -46,6 +50,7 @@ cdef extern from "cudf/io/json.hpp" \
void enable_dayfirst(bool val) except +
void enable_experimental(bool val) except +
void enable_keep_quotes(bool val) except +
void set_recovery_mode(json_recovery_mode_t val) except +

@staticmethod
json_reader_options_builder builder(
Expand Down
20 changes: 11 additions & 9 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def read_json(
storage_options=None,
mixed_types_as_string=False,
prune_columns=False,
on_bad_lines="error",
*args,
**kwargs,
):
Expand Down Expand Up @@ -94,15 +95,16 @@ def read_json(
filepaths_or_buffers.append(tmp_source)

df = libjson.read_json(
filepaths_or_buffers,
dtype,
lines,
compression,
byte_range,
False,
keep_quotes,
mixed_types_as_string,
prune_columns,
filepaths_or_buffers=filepaths_or_buffers,
dtype=dtype,
lines=lines,
compression=compression,
byte_range=byte_range,
legacy=False,
keep_quotes=keep_quotes,
mixed_types_as_string=mixed_types_as_string,
prune_columns=prune_columns,
on_bad_lines=on_bad_lines,
)
else:
warnings.warn(
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,11 @@

If True, only return those columns mentioned in the dtype argument.
If `False`, the dtype argument is used as a type inference suggestion.
on_bad_lines : {'error', 'recover'}, default 'error'
Specifies what to do upon encountering a bad line. Allowed values are:

- ``'error'``, raise an Exception when a bad line is encountered.
- ``'recover'``, fills the row with <NA> when a bad line is encountered.
Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
Expand Down
Loading