Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix write_json failure for zero columns in table/struct #17414

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 41 additions & 22 deletions cpp/src/io/json/write_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ struct validity_fn {
*
* @param strings_columns Table of strings columns
* @param column_names Column of names for each column in the table
* @param num_rows Number of rows in the table
* @param row_prefix Prepend this string to each row
* @param row_suffix Append this string to each row
* @param value_separator Separator between values
Expand All @@ -255,6 +256,7 @@ struct validity_fn {
*/
std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
column_view const& column_names,
size_type const num_rows,
string_view const row_prefix,
string_view const row_suffix,
string_view const value_separator,
Expand All @@ -268,40 +270,53 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
auto const num_columns = strings_columns.num_columns();
CUDF_EXPECTS(num_columns == column_names.size(),
"Number of column names should be equal to number of columns in the table");
auto const strings_count = strings_columns.num_rows();
if (strings_count == 0) // empty begets empty
if (num_rows == 0) // empty begets empty
return make_empty_column(type_id::STRING);
// check all columns are of type string
CUDF_EXPECTS(std::all_of(strings_columns.begin(),
strings_columns.end(),
[](auto const& c) { return c.type().id() == type_id::STRING; }),
"All columns must be of type string");
auto constexpr strviews_per_column = 3; // (for each "column_name:", "value", "separator")
auto const num_strviews_per_row = strings_columns.num_columns() * strviews_per_column + 1;
auto const num_strviews_per_row = 1 + strings_columns.num_columns() * strviews_per_column +
(strings_columns.num_columns() == 0 ? 1 : 0);
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
// e.g. {col1: value, col2: value, col3: value} = 1 + 3 + 3 + (3-1) + 1 = 10

auto tbl_device_view = cudf::table_device_view::create(strings_columns, stream);
auto d_column_names = column_device_view::create(column_names, stream);

// Note for future: chunk it but maximize parallelism, if memory usage is high.
auto const total_strings = num_strviews_per_row * strings_columns.num_rows();
auto const total_rows = strings_columns.num_rows() * strings_columns.num_columns();
auto const total_strings = num_strviews_per_row * num_rows;
auto const total_rows = num_rows * strings_columns.num_columns();
rmm::device_uvector<string_view> d_strviews(total_strings, stream);
struct_scatter_strings_fn scatter_fn{*tbl_device_view,
*d_column_names,
strviews_per_column,
num_strviews_per_row,
row_prefix,
row_suffix,
value_separator,
narep.value(stream),
include_nulls,
d_strviews.begin()};
// scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views
thrust::for_each(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(total_rows),
scatter_fn);
if (strings_columns.num_columns() > 0) {
struct_scatter_strings_fn scatter_fn{*tbl_device_view,
*d_column_names,
strviews_per_column,
num_strviews_per_row,
row_prefix,
row_suffix,
value_separator,
narep.value(stream),
include_nulls,
d_strviews.begin()};
// scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views
thrust::for_each(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(total_rows),
scatter_fn);
} else {
thrust::for_each(
rmm::exec_policy(stream),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we do nosync here? (maybe in the other branch as well)

Suggested change
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed this early. The entire file could use nosync update. Created PR #17445

thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_rows),
[d_strviews = d_strviews.begin(), row_prefix, row_suffix, num_strviews_per_row] __device__(
auto idx) {
auto const this_index = idx * num_strviews_per_row;
d_strviews[this_index] = row_prefix;
d_strviews[this_index + num_strviews_per_row - 1] = row_suffix;
});
}
if (!include_nulls) {
// if previous column was null, then we skip the value separator
rmm::device_uvector<bool> d_str_separator(total_rows, stream);
Expand Down Expand Up @@ -341,7 +356,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,

// gather from offset and create a new string column
auto old_offsets = strings_column_view(joined_col->view()).offsets();
rmm::device_uvector<size_type> row_string_offsets(strings_columns.num_rows() + 1, stream, mr);
rmm::device_uvector<size_type> row_string_offsets(num_rows + 1, stream, mr);
auto const d_strview_offsets = cudf::detail::make_counting_transform_iterator(
0, cuda::proclaim_return_type<size_type>([num_strviews_per_row] __device__(size_type const i) {
return i * num_strviews_per_row;
Expand All @@ -353,7 +368,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
row_string_offsets.begin());
auto chars_data = joined_col->release().data;
return make_strings_column(
strings_columns.num_rows(),
num_rows,
std::make_unique<cudf::column>(std::move(row_string_offsets), rmm::device_buffer{}, 0),
std::move(chars_data.release()[0]),
0,
Expand Down Expand Up @@ -677,6 +692,7 @@ struct column_to_strings_fn {
auto col_string = operator()(child_it,
child_it + column.num_children(),
children_names,
column.size(),
struct_row_end_wrap.value(stream_));
col_string->set_null_mask(cudf::detail::copy_bitmask(column, stream_, mr_),
column.null_count());
Expand All @@ -688,6 +704,7 @@ struct column_to_strings_fn {
std::unique_ptr<column> operator()(column_iterator column_begin,
column_iterator column_end,
host_span<column_name_info const> children_names,
size_type num_rows,
cudf::string_view const row_end_wrap_value) const
{
auto const num_columns = std::distance(column_begin, column_end);
Expand Down Expand Up @@ -733,6 +750,7 @@ struct column_to_strings_fn {
//
return struct_to_strings(str_table_view,
column_names_view,
num_rows,
struct_row_begin_wrap.value(stream_),
row_end_wrap_value,
struct_value_separator.value(stream_),
Expand Down Expand Up @@ -908,6 +926,7 @@ void write_json_uncompressed(data_sink* out_sink,
auto str_concat_col = converter(sub_view.begin(),
sub_view.end(),
user_column_names,
sub_view.num_rows(),
d_line_terminator_with_row_end.value(stream));

// Needs line_terminator at the end, to separate from next chunk
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,12 @@ def test_cudf_json_writer_read(gdf_writer_types):
"""{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""",
"""{"a":{"L": [{}, {}]}, "b":1.1}\n""",
),
# empty structs
("""{"A": null}\n {"A": {}}\n {}""", """{}\n{"A":{}}\n{}\n"""),
(
"""{"A": {"B": null}}\n {"A": {"B": {}}}\n {"A": {}}""",
"""{"A":{}}\n{"A":{"B":{}}}\n{"A":{}}\n""",
),
],
)
def test_cudf_json_roundtrip(jsonl_string, expected):
Expand Down
Loading