
address review comments
karthikeyann committed Nov 16, 2022
1 parent fe93d20 commit 41e70a5
Showing 3 changed files with 18 additions and 17 deletions.
5 changes: 2 additions & 3 deletions cpp/src/io/json/experimental/byte_range_info.cu
@@ -28,9 +28,8 @@ size_type find_first_delimiter(device_span<char const> d_data,
                                char const delimiter,
                                rmm::cuda_stream_view stream)
 {
-  auto const is_delimiter = [delimiter] __device__(char c) { return c == delimiter; };
-  auto first_delimiter_position =
-    thrust::find_if(rmm::exec_policy(stream), d_data.begin(), d_data.end(), is_delimiter);
+  auto const first_delimiter_position =
+    thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter);
   return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1;
 }

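The hunk above swaps thrust::find_if plus an equality lambda for a plain thrust::find on the delimiter value; the contract is unchanged: return the byte offset of the first delimiter, or -1 when it is absent. As a rough host-side illustration only (not part of the commit; the function name just mirrors the device-side one), the same contract in Python is:

def find_first_delimiter(data: bytes, delimiter: bytes) -> int:
    # Byte offset of the first delimiter, or -1 when it is absent --
    # the same contract the device-side thrust::find version keeps.
    return data.find(delimiter)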
6 changes: 3 additions & 3 deletions cpp/tests/io/json_chunked_reader.cpp
@@ -34,15 +34,15 @@ struct JsonReaderTest : public cudf::test::BaseFixture {
 std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
   cudf::host_span<std::unique_ptr<cudf::io::datasource>> sources,
   cudf::io::json_reader_options const& reader_opts,
-  int chunk_size,
+  int32_t chunk_size,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
   using namespace cudf::io::detail::json::experimental;
   using cudf::size_type;
   // assuming single source.
   size_t total_source_size = 0;
-  for (const auto& source : sources) {
+  for (auto const& source : sources) {
     total_source_size += source->size();
   }
   size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size;
@@ -115,7 +115,7 @@ TEST_F(JsonReaderTest, ByteRange)

   // Test for different chunk sizes
   for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) {
-    const auto tables = skeleton_for_parellel_chunk_reader(datasources,
+    auto const tables = skeleton_for_parellel_chunk_reader(datasources,
                                                            json_lines_options,
                                                            chunk_size,
                                                            cudf::get_default_stream(),
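For reference, the chunking arithmetic in skeleton_for_parellel_chunk_reader above is plain ceiling division: the source is split into fixed-size byte ranges, and the last range covers whatever remains. A minimal Python sketch of that arithmetic (illustration only; the helper name is made up):

def byte_range_chunks(total_source_size: int, chunk_size: int) -> list[tuple[int, int]]:
    # Ceiling division, matching (total_source_size + chunk_size - 1) / chunk_size.
    num_chunks = (total_source_size + chunk_size - 1) // chunk_size
    # Each chunk is (offset, size); the last one may be shorter than chunk_size.
    return [
        (i * chunk_size, min(chunk_size, total_source_size - i * chunk_size))
        for i in range(num_chunks)
    ]

# byte_range_chunks(25, 10) -> [(0, 10), (10, 10), (20, 5)]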
24 changes: 13 additions & 11 deletions python/cudf/cudf/tests/test_json.py
@@ -948,10 +948,6 @@ def test_json_dtypes_nested_data():
 class TestNestedJsonReaderCommon:
     @pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024])
     def test_chunked_nested_json_reader(self, tag, data, chunk_size):
-        if tag == "missing" and chunk_size == 10:
-            pytest.xfail(
-                reason="cudf inferences integer with nulls as float64"
-            )
         expected = cudf.read_json(
             StringIO(data), engine="cudf_experimental", lines=True
         )
@@ -968,19 +964,25 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):
                 )
             )
         df = cudf.concat(chunks, ignore_index=True)
-        assert expected.to_arrow().equals(df.to_arrow())
+        if tag == "missing" and chunk_size == 10:
+            with pytest.raises(AssertionError):
+                # nested JSON reader inferences integer with nulls as float64
+                assert expected.to_arrow().equals(df.to_arrow())
+        else:
+            assert expected.to_arrow().equals(df.to_arrow())

     def test_order_nested_json_reader(self, tag, data):
-        if tag == "dtype_mismatch":
-            pytest.xfail(
-                reason="pandas parses integer values in float representation"
-                " as integer"
-            )
         expected = pd.read_json(StringIO(data), lines=True)
         target = cudf.read_json(
             StringIO(data), engine="cudf_experimental", lines=True
         )
-        assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        if tag == "dtype_mismatch":
+            with pytest.raises(AssertionError):
+                # pandas parses integer values in float representation
+                # as integer
+                assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        else:
+            assert pa.Table.from_pandas(expected).equals(target.to_arrow())


def test_json_round_trip_gzip():
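The test_json.py changes above replace up-front pytest.xfail calls with pytest.raises(AssertionError) wrapped around only the comparison that is known to fail, so the rest of each test still executes. A minimal standalone sketch of the two patterns (hypothetical test names, simplified condition; not part of the commit):

import pytest

def test_known_gap_old_style():
    # Imperative xfail: the test stops here and nothing below ever runs.
    pytest.xfail(reason="known dtype-inference gap")
    assert False  # never reached

def test_known_gap_new_style():
    result_matches = False  # stand-in for the known-bad comparison
    # Only the failing assertion is wrapped, so setup and the other checks still
    # run, and the test fails loudly ("DID NOT RAISE") if the gap is ever fixed.
    with pytest.raises(AssertionError):
        assert result_matches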
