From 41e70a5c9314fb8db5724340610d8a8962af001f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 16 Nov 2022 18:07:26 +0530 Subject: [PATCH] address review comments --- .../io/json/experimental/byte_range_info.cu | 5 ++-- cpp/tests/io/json_chunked_reader.cpp | 6 ++--- python/cudf/cudf/tests/test_json.py | 24 ++++++++++--------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/experimental/byte_range_info.cu b/cpp/src/io/json/experimental/byte_range_info.cu index 9cfe37fc0c4..d6e30d090a5 100644 --- a/cpp/src/io/json/experimental/byte_range_info.cu +++ b/cpp/src/io/json/experimental/byte_range_info.cu @@ -28,9 +28,8 @@ size_type find_first_delimiter(device_span d_data, char const delimiter, rmm::cuda_stream_view stream) { - auto const is_delimiter = [delimiter] __device__(char c) { return c == delimiter; }; - auto first_delimiter_position = - thrust::find_if(rmm::exec_policy(stream), d_data.begin(), d_data.end(), is_delimiter); + auto const first_delimiter_position = + thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter); return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1; } diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp index 36ba5b61453..a22cdccf3fb 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json_chunked_reader.cpp @@ -34,7 +34,7 @@ struct JsonReaderTest : public cudf::test::BaseFixture { std::vector skeleton_for_parellel_chunk_reader( cudf::host_span> sources, cudf::io::json_reader_options const& reader_opts, - int chunk_size, + int32_t chunk_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -42,7 +42,7 @@ std::vector skeleton_for_parellel_chunk_reader( using cudf::size_type; // assuming single source. 
size_t total_source_size = 0; - for (const auto& source : sources) { + for (auto const& source : sources) { total_source_size += source->size(); } size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; @@ -115,7 +115,7 @@ TEST_F(JsonReaderTest, ByteRange) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) { - const auto tables = skeleton_for_parellel_chunk_reader(datasources, + auto const tables = skeleton_for_parellel_chunk_reader(datasources, json_lines_options, chunk_size, cudf::get_default_stream(), diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 6701977b165..2eda71c5c45 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -948,10 +948,6 @@ def test_json_dtypes_nested_data(): class TestNestedJsonReaderCommon: @pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024]) def test_chunked_nested_json_reader(self, tag, data, chunk_size): - if tag == "missing" and chunk_size == 10: - pytest.xfail( - reason="cudf inferences integer with nulls as float64" - ) expected = cudf.read_json( StringIO(data), engine="cudf_experimental", lines=True ) @@ -968,19 +964,25 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size): ) ) df = cudf.concat(chunks, ignore_index=True) - assert expected.to_arrow().equals(df.to_arrow()) + if tag == "missing" and chunk_size == 10: + with pytest.raises(AssertionError): + # nested JSON reader infers integers with nulls as float64 + assert expected.to_arrow().equals(df.to_arrow()) + else: + assert expected.to_arrow().equals(df.to_arrow()) def test_order_nested_json_reader(self, tag, data): - if tag == "dtype_mismatch": - pytest.xfail( - reason="pandas parses integer values in float representation" - " as integer" - ) expected = pd.read_json(StringIO(data), lines=True) target = cudf.read_json( StringIO(data), engine="cudf_experimental", lines=True ) - assert 
pa.Table.from_pandas(expected).equals(target.to_arrow()) + if tag == "dtype_mismatch": + with pytest.raises(AssertionError): + # pandas parses integer values in float representation + # as integer + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + else: + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) def test_json_round_trip_gzip():