From 12af24a82dcd4c3f5aa4ece98b059e773a88ef5e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 23 May 2024 09:54:36 +0000 Subject: [PATCH 1/2] implement on_bad_lines in json reader --- python/cudf/cudf/_lib/json.pyx | 15 +++++++++++++- .../cudf/_lib/pylibcudf/libcudf/io/json.pxd | 5 +++++ python/cudf/cudf/io/json.py | 20 ++++++++++--------- python/cudf/cudf/utils/ioutils.py | 5 +++++ 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 283a451dd4a..8dca784da4d 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport ( from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.json cimport ( json_reader_options, + json_recovery_mode_t, json_writer_options, read_json as libcudf_read_json, schema_element, @@ -42,6 +43,15 @@ from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +def _get_json_recovery_mode(object on_bad_lines): + if on_bad_lines.lower() == "error": + return json_recovery_mode_t.FAIL + elif on_bad_lines.lower() == "recover": + return json_recovery_mode_t.RECOVER_WITH_NULL + else: + raise TypeError(f"Invalid parameter for {on_bad_lines=}") + + cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, @@ -50,7 +60,8 @@ cpdef read_json(object filepaths_or_buffers, bool legacy, bool keep_quotes, bool mixed_types_as_string, - bool prune_columns): + bool prune_columns, + object on_bad_lines): """ Cython function to call into libcudf API, see `read_json`. @@ -130,6 +141,8 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) + opts.set_recovery_mode(_get_json_recovery_mode(on_bad_lines)) + # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 7e64a4cae29..66ac8bc95f5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types + cdef enum json_recovery_mode_t: + FAIL "cudf::io::json_recovery_mode_t::FAIL" + RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cdef cppclass json_reader_options: json_reader_options() except + cudf_io_types.source_info get_source() except + @@ -46,6 +50,7 @@ cdef extern from "cudf/io/json.hpp" \ void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + + void set_recovery_mode(json_recovery_mode_t val) except + @staticmethod json_reader_options_builder builder( diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 03d07fc3a50..1e3805f732c 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -27,6 +27,7 @@ def read_json( storage_options=None, mixed_types_as_string=False, prune_columns=False, + on_bad_lines="error", *args, **kwargs, ): @@ -94,15 +95,16 @@ def read_json( filepaths_or_buffers.append(tmp_source) df = libjson.read_json( - filepaths_or_buffers, - dtype, - lines, - compression, - byte_range, - False, - keep_quotes, - mixed_types_as_string, - prune_columns, + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + lines=lines, + compression=compression, + byte_range=byte_range, + legacy=False, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, ) else: warnings.warn( diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1366a0b8e84..0209c692935 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -739,6 +739,11 @@ If True, only return those columns mentioned in the dtype argument. If `False` dtype argument is used a type inference suggestion. +on_bad_lines : {'error', 'recover'}, default 'error' + Specifies what to do upon encountering a bad line. Allowed values are : + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'recover'``, fills the row with when a bad line is encountered. Returns ------- result : Series or DataFrame, depending on the value of `typ`. From a3aff9aa834833db0f26659f080378476c80dd5a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 24 May 2024 18:56:42 +0000 Subject: [PATCH 2/2] fix cython and add pytests --- python/cudf/cudf/_lib/json.pyx | 4 +-- .../cudf/_lib/pylibcudf/libcudf/io/json.pxd | 4 ++- python/cudf/cudf/tests/test_json.py | 31 +++++++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 40e7e5c2c10..a8fef907bad 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -43,7 +43,7 @@ from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table -def _get_json_recovery_mode(object on_bad_lines): +cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): if on_bad_lines.lower() == "error": return json_recovery_mode_t.FAIL elif on_bad_lines.lower() == "recover": @@ -129,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) + .recovery_mode(_get_json_recovery_mode(on_bad_lines)) .build() ) if is_list_like_dtypes: @@ -139,7 +140,6 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) - opts.set_recovery_mode(_get_json_recovery_mode(on_bad_lines)) # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 939cc141e1d..2e50cccd132 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -50,7 +50,6 @@ cdef extern from "cudf/io/json.hpp" \ void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + - void set_recovery_mode(json_recovery_mode_t val) except + @staticmethod json_reader_options_builder builder( @@ -95,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& keep_quotes( bool val ) except + + json_reader_options_builder& recovery_mode( + json_recovery_mode_t val + ) except + json_reader_options build() except + diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 51287fe26a0..ba6a8f94719 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string): orient="records", lines=True, ) + + +@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"]) +def test_json_reader_on_bad_lines(on_bad_lines): + json_input = StringIO( + '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + ) + if on_bad_lines == "error": + with pytest.raises(RuntimeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) + elif on_bad_lines == "recover": + actual = cudf.read_json( + json_input, lines=True, orient="records", on_bad_lines=on_bad_lines + ) + expected = cudf.DataFrame( + {"a": [1, 2, None, 3], "b": [10, 11, None, 12]} + ) + assert_eq(actual, expected) + else: + with pytest.raises(TypeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + )