
address review comments
karthikeyann committed Nov 16, 2022
1 parent fe93d20 commit 41e70a5
Showing 3 changed files with 18 additions and 17 deletions.
5 changes: 2 additions & 3 deletions cpp/src/io/json/experimental/byte_range_info.cu
@@ -28,9 +28,8 @@ size_type find_first_delimiter(device_span<char const> d_data,
                                char const delimiter,
                                rmm::cuda_stream_view stream)
 {
-  auto const is_delimiter = [delimiter] __device__(char c) { return c == delimiter; };
-  auto first_delimiter_position =
-    thrust::find_if(rmm::exec_policy(stream), d_data.begin(), d_data.end(), is_delimiter);
+  auto const first_delimiter_position =
+    thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter);
   return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1;
 }

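The hunk above swaps thrust::find_if plus an equality lambda for a plain thrust::find on the delimiter value; the contract is unchanged: return the byte offset of the first delimiter, or -1 when it is absent. As a rough host-side illustration only (not part of the commit; the function name just mirrors the device-side one), the same contract in Python is:

def find_first_delimiter(data: bytes, delimiter: bytes) -> int:
    # Byte offset of the first delimiter, or -1 when it is absent --
    # the same contract the device-side thrust::find version keeps.
    return data.find(delimiter)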
6 changes: 3 additions & 3 deletions cpp/tests/io/json_chunked_reader.cpp
@@ -34,15 +34,15 @@ struct JsonReaderTest : public cudf::test::BaseFixture {
 std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
   cudf::host_span<std::unique_ptr<cudf::io::datasource>> sources,
   cudf::io::json_reader_options const& reader_opts,
-  int chunk_size,
+  int32_t chunk_size,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
   using namespace cudf::io::detail::json::experimental;
   using cudf::size_type;
   // assuming single source.
   size_t total_source_size = 0;
-  for (const auto& source : sources) {
+  for (auto const& source : sources) {
     total_source_size += source->size();
   }
   size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size;
@@ -115,7 +115,7 @@ TEST_F(JsonReaderTest, ByteRange)

   // Test for different chunk sizes
   for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) {
-    const auto tables = skeleton_for_parellel_chunk_reader(datasources,
+    auto const tables = skeleton_for_parellel_chunk_reader(datasources,
                                                            json_lines_options,
                                                            chunk_size,
                                                            cudf::get_default_stream(),
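For reference, the chunking arithmetic in skeleton_for_parellel_chunk_reader above is plain ceiling division: the source is split into fixed-size byte ranges, and the last range covers whatever remains. A minimal Python sketch of that arithmetic (illustration only; the helper name is made up):

def byte_range_chunks(total_source_size: int, chunk_size: int) -> list[tuple[int, int]]:
    # Ceiling division, matching (total_source_size + chunk_size - 1) / chunk_size.
    num_chunks = (total_source_size + chunk_size - 1) // chunk_size
    # Each chunk is (offset, size); the last one may be shorter than chunk_size.
    return [
        (i * chunk_size, min(chunk_size, total_source_size - i * chunk_size))
        for i in range(num_chunks)
    ]

# byte_range_chunks(25, 10) -> [(0, 10), (10, 10), (20, 5)]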
24 changes: 13 additions & 11 deletions python/cudf/cudf/tests/test_json.py
@@ -948,10 +948,6 @@ def test_json_dtypes_nested_data():
 class TestNestedJsonReaderCommon:
     @pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024])
     def test_chunked_nested_json_reader(self, tag, data, chunk_size):
-        if tag == "missing" and chunk_size == 10:
-            pytest.xfail(
-                reason="cudf inferences integer with nulls as float64"
-            )
         expected = cudf.read_json(
             StringIO(data), engine="cudf_experimental", lines=True
         )
@@ -968,19 +964,25 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):
                 )
             )
         df = cudf.concat(chunks, ignore_index=True)
-        assert expected.to_arrow().equals(df.to_arrow())
+        if tag == "missing" and chunk_size == 10:
+            with pytest.raises(AssertionError):
+                # nested JSON reader inferences integer with nulls as float64
+                assert expected.to_arrow().equals(df.to_arrow())
+        else:
+            assert expected.to_arrow().equals(df.to_arrow())

     def test_order_nested_json_reader(self, tag, data):
-        if tag == "dtype_mismatch":
-            pytest.xfail(
-                reason="pandas parses integer values in float representation"
-                " as integer"
-            )
         expected = pd.read_json(StringIO(data), lines=True)
         target = cudf.read_json(
             StringIO(data), engine="cudf_experimental", lines=True
         )
-        assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        if tag == "dtype_mismatch":
+            with pytest.raises(AssertionError):
+                # pandas parses integer values in float representation
+                # as integer
+                assert pa.Table.from_pandas(expected).equals(target.to_arrow())
+        else:
+            assert pa.Table.from_pandas(expected).equals(target.to_arrow())


def test_json_round_trip_gzip():
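The test_json.py changes above replace up-front pytest.xfail calls with pytest.raises(AssertionError) wrapped around only the comparison that is known to fail, so the rest of each test still executes. A minimal standalone sketch of the two patterns (hypothetical test names, simplified condition; not part of the commit):

import pytest

def test_known_gap_old_style():
    # Imperative xfail: the test stops here and nothing below ever runs.
    pytest.xfail(reason="known dtype-inference gap")
    assert False  # never reached

def test_known_gap_new_style():
    result_matches = False  # stand-in for the known-bad comparison
    # Only the failing assertion is wrapped, so setup and the other checks still
    # run, and the test fails loudly ("DID NOT RAISE") if the gap is ever fixed.
    with pytest.raises(AssertionError):
        assert result_matches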
