Skip to content

Commit

Permalink
Fix missing trailing comma in json writer (#12688)
Browse files Browse the repository at this point in the history
Fix missing trailing comma in json writer for non-Lines json format.
Updates default rows_per_chunk because `8` is too small and writer is slow.
closes #12687

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #12688
  • Loading branch information
karthikeyann authored Feb 13, 2023
1 parent bad94b9 commit 53183cd
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 16 deletions.
23 changes: 11 additions & 12 deletions cpp/src/io/json/write_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -581,14 +581,13 @@ std::unique_ptr<column> make_column_names_column(host_span<column_name_info cons

void write_chunked(data_sink* out_sink,
strings_column_view const& str_column_view,
std::string const& line_terminator,
string_scalar const& d_line_terminator,
json_writer_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");

string_scalar d_line_terminator{line_terminator};
auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view,
d_line_terminator,
string_scalar("", false),
Expand All @@ -609,15 +608,6 @@ void write_chunked(data_sink* out_sink,

out_sink->host_write(h_bytes.data(), total_num_bytes);
}

// Needs newline at the end, to separate from next chunk
if (options.is_enabled_lines()) {
if (out_sink->is_device_write_preferred(d_line_terminator.size())) {
out_sink->device_write(d_line_terminator.data(), d_line_terminator.size(), stream);
} else {
out_sink->host_write(line_terminator.data(), line_terminator.size());
}
}
}

void write_json(data_sink* out_sink,
Expand Down Expand Up @@ -697,7 +687,16 @@ void write_json(data_sink* out_sink,
// struct converter for the table
auto str_concat_col = converter(sub_view.begin(), sub_view.end(), user_column_names);

write_chunked(out_sink, str_concat_col->view(), line_terminator, options, stream, mr);
write_chunked(out_sink, str_concat_col->view(), d_line_terminator, options, stream, mr);

// Needs line_terminator at the end, to separate from next chunk
if (&sub_view != &vector_views.back() or options.is_enabled_lines()) {
if (out_sink->is_device_write_preferred(d_line_terminator.size())) {
out_sink->device_write(d_line_terminator.data(), d_line_terminator.size(), stream);
} else {
out_sink->host_write(line_terminator.data(), line_terminator.size());
}
}
}
} else {
if (options.is_enabled_lines()) {
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def write_json(
bool include_nulls=True,
bool lines=False,
bool index=False,
int rows_per_chunk=8,
int rows_per_chunk=1024*256, # 256K rows
):
"""
Cython function to call into libcudf API, see `write_json`.
Expand Down
15 changes: 12 additions & 3 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,23 @@ def test_json_writer(tmpdir, pdf, gdf):
assert_eq(pdf_string, gdf_string)


def test_cudf_json_writer(pdf):
@pytest.mark.parametrize(
"lines", [True, False], ids=["lines=True", "lines=False"]
)
def test_cudf_json_writer(pdf, lines):
# removing datetime column because pandas doesn't support it
for col_name in pdf.columns:
if "datetime" in col_name:
pdf.drop(col_name, axis=1, inplace=True)
gdf = cudf.DataFrame.from_pandas(pdf)
pdf_string = pdf.to_json(orient="records", lines=True)
gdf_string = gdf.to_json(orient="records", lines=True, engine="cudf")
pdf_string = pdf.to_json(orient="records", lines=lines)
gdf_string = gdf.to_json(orient="records", lines=lines, engine="cudf")

assert_eq(pdf_string, gdf_string)

gdf_string = gdf.to_json(
orient="records", lines=lines, engine="cudf", rows_per_chunk=8
)

assert_eq(pdf_string, gdf_string)

Expand Down

0 comments on commit 53183cd

Please sign in to comment.