Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set the null count in output columns in the CSV reader #13221

Merged
merged 4 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
* @param[in] dtypes The data type of the column
* @param[out] columns The output column data
* @param[out] valids The bitmaps indicating whether column fields are valid
* @param[out] valid_counts The number of valid fields in each column
*/
__global__ void __launch_bounds__(csvparse_block_dim)
convert_csv_to_cudf(cudf::io::parse_options_view options,
Expand All @@ -310,16 +311,16 @@ __global__ void __launch_bounds__(csvparse_block_dim)
device_span<uint64_t const> row_offsets,
device_span<cudf::data_type const> dtypes,
device_span<void* const> columns,
device_span<cudf::bitmask_type* const> valids)
device_span<cudf::bitmask_type* const> valids,
device_span<size_type> valid_counts)
{
auto const raw_csv = data.data();
// thread IDs range per block, so also need the block id.
// this is entry into the field array - tid is an elements within the num_entries array
long const rec_id = threadIdx.x + (blockDim.x * blockIdx.x);
long const rec_id_next = rec_id + 1;

// we can have more threads than data, make sure we are not past the end of
// the data
// we can have more threads than data, make sure we are not past the end of the data
if (rec_id_next >= row_offsets.size()) return;

auto field_start = raw_csv + row_offsets[rec_id];
Expand Down Expand Up @@ -370,6 +371,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
column_flags[col] & column_parse::as_hexadecimal)) {
// set the valid bitmap - all bits were set to 0 to start
set_bit(valids[actual_col], rec_id);
atomicAdd(&valid_counts[actual_col], 1);
}
}
} else if (dtypes[actual_col].id() == cudf::type_id::STRING) {
Expand Down Expand Up @@ -803,22 +805,23 @@ std::vector<column_type_histogram> detect_column_types(
return detail::make_std_vector_sync(d_stats, stream);
}

void __host__ decode_row_column_data(cudf::io::parse_options_view const& options,
device_span<char const> data,
device_span<column_parse::flags const> column_flags,
device_span<uint64_t const> row_offsets,
device_span<cudf::data_type const> dtypes,
device_span<void* const> columns,
device_span<cudf::bitmask_type* const> valids,
rmm::cuda_stream_view stream)
void decode_row_column_data(cudf::io::parse_options_view const& options,
device_span<char const> data,
device_span<column_parse::flags const> column_flags,
device_span<uint64_t const> row_offsets,
device_span<cudf::data_type const> dtypes,
device_span<void* const> columns,
device_span<cudf::bitmask_type* const> valids,
device_span<size_type> valid_counts,
rmm::cuda_stream_view stream)
{
// Calculate actual block count to use based on records count
auto const block_size = csvparse_block_dim;
auto const num_rows = row_offsets.size() - 1;
auto const grid_size = (num_rows + block_size - 1) / block_size;

convert_csv_to_cudf<<<grid_size, block_size, 0, stream.value()>>>(
options, data, column_flags, row_offsets, dtypes, columns, valids);
options, data, column_flags, row_offsets, dtypes, columns, valids, valid_counts);
}

uint32_t __host__ gather_row_offsets(const parse_options_view& options,
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/io/csv/csv_gpu.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -217,6 +217,7 @@ std::vector<column_type_histogram> detect_column_types(
* @param[in] dtypes List of dtype corresponding to each column
* @param[out] columns Device memory output of column data
* @param[out] valids Device memory output of column valids bitmap data
* @param[out] valid_counts Device memory output of the number of valid fields in each column
* @param[in] stream CUDA stream to use, default 0
*/
void decode_row_column_data(cudf::io::parse_options_view const& options,
Expand All @@ -226,6 +227,7 @@ void decode_row_column_data(cudf::io::parse_options_view const& options,
device_span<cudf::data_type const> dtypes,
device_span<void* const> columns,
device_span<cudf::bitmask_type* const> valids,
device_span<size_type> valid_counts,
rmm::cuda_stream_view stream);

} // namespace gpu
Expand Down
16 changes: 12 additions & 4 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include <cudf/io/datasource.hpp>
#include <cudf/io/detail/csv.hpp>
#include <cudf/io/types.hpp>
#include <cudf/strings/replace.hpp>
#include <cudf/strings/detail/replace.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>
Expand Down Expand Up @@ -580,8 +580,7 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
if (column_flags[col] & column_parse::enabled) {
auto out_buffer = column_buffer(column_types[active_col], num_records, true, stream, mr);

out_buffer.name = column_names[col];
out_buffer.null_count() = UNKNOWN_NULL_COUNT;
out_buffer.name = column_names[col];
out_buffers.emplace_back(std::move(out_buffer));
active_col++;
}
Expand All @@ -595,6 +594,9 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
h_valid[i] = out_buffers[i].null_mask();
}

auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async<size_type>(
num_active_columns, stream, rmm::mr::get_current_device_resource());

cudf::io::csv::gpu::decode_row_column_data(
parse_opts.view(),
data,
Expand All @@ -603,8 +605,14 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
make_device_uvector_async(column_types, stream, rmm::mr::get_current_device_resource()),
make_device_uvector_async(h_data, stream, rmm::mr::get_current_device_resource()),
make_device_uvector_async(h_valid, stream, rmm::mr::get_current_device_resource()),
d_valid_counts,
stream);

auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream);
for (int i = 0; i < num_active_columns; ++i) {
out_buffers[i].null_count() = num_records - h_valid_counts[i];
}

return out_buffers;
}

Expand Down Expand Up @@ -859,7 +867,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
const std::string dblquotechar(2, parse_opts.quotechar);
std::unique_ptr<column> col = cudf::make_strings_column(*out_buffers[i]._strings, stream);
out_columns.emplace_back(
cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr));
cudf::strings::detail::replace(col->view(), dblquotechar, quotechar, -1, stream, mr));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated change: I just noticed that the detail API needs to be used here to ensure that the right stream is used.

} else {
out_columns.emplace_back(make_column(out_buffers[i], nullptr, std::nullopt, stream));
}
Expand Down
18 changes: 17 additions & 1 deletion cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ void check_float_column(cudf::column_view const& col_lhs,

CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUIVALENT(col_lhs,
(wrapper<T>{data.begin(), data.end(), validity}));
CUDF_EXPECTS(col_lhs.null_count() == 0, "All elements should be valid");
CUDF_EXPECTS(col_lhs.null_count() == 0 and col_rhs.null_count() == 0,
"All elements should be valid");
Comment on lines +172 to +173
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is one test that was (kind of) checking the null count. With this additional check, it fails when the null count is incorrect.

EXPECT_THAT(cudf::test::to_host<T>(col_lhs).first,
::testing::Pointwise(FloatNearPointwise(tol), data));
}
Expand Down Expand Up @@ -2464,4 +2465,19 @@ TEST_F(CsvReaderTest, BlankLineAfterFirstRow)
}
}

// Verifies that the CSV reader reports the null count of every output column.
// The input holds eight header-less rows with three fields each:
//   - field 0 is populated on every row,
//   - field 1 is empty on rows 0, 2 and 3,
//   - field 2 is empty on every row.
TEST_F(CsvReaderTest, NullCount)
{
  std::string const csv_text = "0,,\n1,1.,\n2,,\n3,,\n4,4.,\n5,5.,\n6,6.,\n7,7.,\n";

  // header(-1): the first line is passed as data rather than as column names.
  auto const source = cudf::io::source_info{csv_text.c_str(), csv_text.size()};
  cudf::io::csv_reader_options reader_opts =
    cudf::io::csv_reader_options::builder(source).header(-1);

  auto const parsed = cudf::io::read_csv(reader_opts);
  auto const table  = parsed.tbl->view();

  EXPECT_EQ(table.num_rows(), 8);
  EXPECT_EQ(table.column(0).null_count(), 0);
  EXPECT_EQ(table.column(1).null_count(), 3);
  EXPECT_EQ(table.column(2).null_count(), 8);
}

CUDF_TEST_PROGRAM_MAIN()