diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 49d035e6cb9..9849629015d 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -581,14 +581,13 @@ std::unique_ptr make_column_names_column(host_span 0, "Unexpected empty strings column."); - string_scalar d_line_terminator{line_terminator}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, d_line_terminator, string_scalar("", false), @@ -609,15 +608,6 @@ void write_chunked(data_sink* out_sink, out_sink->host_write(h_bytes.data(), total_num_bytes); } - - // Needs newline at the end, to separate from next chunk - if (options.is_enabled_lines()) { - if (out_sink->is_device_write_preferred(d_line_terminator.size())) { - out_sink->device_write(d_line_terminator.data(), d_line_terminator.size(), stream); - } else { - out_sink->host_write(line_terminator.data(), line_terminator.size()); - } - } } void write_json(data_sink* out_sink, @@ -697,7 +687,16 @@ void write_json(data_sink* out_sink, // struct converter for the table auto str_concat_col = converter(sub_view.begin(), sub_view.end(), user_column_names); - write_chunked(out_sink, str_concat_col->view(), line_terminator, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), d_line_terminator, options, stream, mr); + + // Needs line_terminator at the end, to separate from next chunk + if (&sub_view != &vector_views.back() or options.is_enabled_lines()) { + if (out_sink->is_device_write_preferred(d_line_terminator.size())) { + out_sink->device_write(d_line_terminator.data(), d_line_terminator.size(), stream); + } else { + out_sink->host_write(line_terminator.data(), line_terminator.size()); + } + } } } else { if (options.is_enabled_lines()) { diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index a40ba7862b2..2339b874ea0 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -156,7 +156,7 @@ def write_json( bool include_nulls=True, bool lines=False, bool index=False, - int rows_per_chunk=8, + int rows_per_chunk=1024*256, # 256K rows ): """ Cython function to call into libcudf API, see `write_json`. diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 81acb43ee7d..b778db4465f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -187,14 +187,23 @@ def test_json_writer(tmpdir, pdf, gdf): assert_eq(pdf_string, gdf_string) -def test_cudf_json_writer(pdf): +@pytest.mark.parametrize( + "lines", [True, False], ids=["lines=True", "lines=False"] +) +def test_cudf_json_writer(pdf, lines): # removing datetime column because pandas doesn't support it for col_name in pdf.columns: if "datetime" in col_name: pdf.drop(col_name, axis=1, inplace=True) gdf = cudf.DataFrame.from_pandas(pdf) - pdf_string = pdf.to_json(orient="records", lines=True) - gdf_string = gdf.to_json(orient="records", lines=True, engine="cudf") + pdf_string = pdf.to_json(orient="records", lines=lines) + gdf_string = gdf.to_json(orient="records", lines=lines, engine="cudf") + + assert_eq(pdf_string, gdf_string) + + gdf_string = gdf.to_json( + orient="records", lines=lines, engine="cudf", rows_per_chunk=8 + ) assert_eq(pdf_string, gdf_string)