Skip to content

Commit

Permalink
Changing cudf::io::source_info to use cudf::host_span<std::byte>
Browse files Browse the repository at this point in the history
…in a non-breaking form (#12730)

Closes #12576

This change converts `cudf::io::source_info` to take a `host_span<std::byte const>`. This version deprecates the original API but leaves it intact to avoid breaking changes. After being deprecated for a few releases, the original API will be removed.

Authors:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #12730
  • Loading branch information
hyperbolic2346 authored Feb 21, 2023
1 parent 94bbc82 commit c2f0161
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 5 deletions.
10 changes: 10 additions & 0 deletions cpp/include/cudf/io/datasource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,21 @@ class datasource {
/**
* @brief Creates a source from a host memory buffer.
*
* @deprecated Since 23.04. Use the overload taking `cudf::host_span<std::byte const>` instead.
*
* @param[in] buffer Host buffer object (non-owning data pointer and size)
* @return Constructed datasource object
*/
static std::unique_ptr<datasource> create(host_buffer const& buffer);

/**
* @brief Creates a source from a span of bytes in host memory.
*
* NOTE(review): the implementation is commented as doing zero-copy reads, so the memory behind
* `buffer` presumably has to remain valid while the returned source is in use - confirm.
*
* @param[in] buffer Span over the bytes of the host buffer
* @return Constructed datasource object
*/
static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);

/**
* @brief Creates a source from a device memory buffer.
*
Expand Down
75 changes: 71 additions & 4 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ struct table_with_metadata {
/**
* @brief Non-owning view of a host memory buffer
*
* @deprecated Since 23.04
*
* Used to describe buffer input in `source_info` objects.
*/
struct host_buffer {
Expand All @@ -166,6 +168,22 @@ struct host_buffer {
host_buffer(const char* data, size_t size) : data(data), size(size) {}
};

/**
 * @brief Tells whether `T` is one of the single-byte types that can reasonably be passed around
 * as a pointer to raw bytes.
 *
 * Any `const`/`volatile` qualification on `T` is stripped before the check. The accepted types
 * are `int8_t`, `char`, `uint8_t`, `unsigned char` and `std::byte`.
 *
 * @tparam T The representation type
 * @return `true` if the type is considered a byte-like type
 */
template <typename T>
constexpr inline auto is_byte_like_type()
{
  using bare_t = std::remove_cv_t<T>;
  return std::disjunction_v<std::is_same<bare_t, int8_t>,
                            std::is_same<bare_t, char>,
                            std::is_same<bare_t, uint8_t>,
                            std::is_same<bare_t, unsigned char>,
                            std::is_same<bare_t, std::byte>>;
}

/**
* @brief Source information for read interfaces
*/
Expand All @@ -191,21 +209,70 @@ struct source_info {
/**
* @brief Construct a new source info object for multiple buffers in host memory
*
* Each `host_buffer` is recorded as a `cudf::host_span<std::byte const>` built from the same
* data pointer and size.
*
* @deprecated Since 23.04. Prefer the constructors taking `cudf::host_span`.
*
* @param host_buffers Input buffers in host memory
*/
explicit source_info(std::vector<host_buffer> const& host_buffers)
: _type(io_type::HOST_BUFFER), _host_buffers(host_buffers)
explicit source_info(std::vector<host_buffer> const& host_buffers) : _type(io_type::HOST_BUFFER)
{
// One output span per input buffer, so size the storage up front.
_host_buffers.reserve(host_buffers.size());
// Re-express every legacy `host_buffer` as a byte span over the same pointer and size.
std::transform(host_buffers.begin(),
host_buffers.end(),
std::back_inserter(_host_buffers),
[](auto const hb) {
return cudf::host_span<std::byte const>{
reinterpret_cast<std::byte const*>(hb.data), hb.size};
});
}

/**
* @brief Construct a new source info object for a single buffer
*
* The pointer is reinterpreted as a pointer to `std::byte` and stored, together with `size`, as
* the only entry of the host buffer list.
*
* @deprecated Since 23.04. Prefer the constructors taking `cudf::host_span`.
*
* @param host_data Input buffer in host memory
* @param size Size of the buffer (used as the length of the stored byte span)
*/
explicit source_info(const char* host_data, size_t size)
: _type(io_type::HOST_BUFFER), _host_buffers({{host_data, size}})
: _type(io_type::HOST_BUFFER),
_host_buffers(
{cudf::host_span<std::byte const>(reinterpret_cast<std::byte const*>(host_data), size)})
{
}

/**
 * @brief Construct a new source info object for multiple buffers in host memory
 *
 * Every input span ends up stored as a `cudf::host_span<std::byte const>`. Spans whose element
 * type is already `std::byte` are copied over as they are; spans of any other byte-like type are
 * rebuilt from their reinterpreted data pointer and their size.
 *
 * @tparam T Element type of the input spans; must satisfy `is_byte_like_type`
 * @param host_buffers Input buffers in host memory
 */
template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)
  : _type(io_type::HOST_BUFFER)
{
  if constexpr (std::is_same_v<std::remove_cv_t<T>, std::byte>) {
    // Already spans of bytes: take them over directly.
    _host_buffers.assign(host_buffers.begin(), host_buffers.end());
  } else {
    _host_buffers.reserve(host_buffers.size());
    for (auto const& buffer : host_buffers) {
      _host_buffers.push_back(cudf::host_span<std::byte const>{
        reinterpret_cast<std::byte const*>(buffer.data()), buffer.size()});
    }
  }
}

/**
 * @brief Construct a new source info object for a single buffer
 *
 * The span's data pointer is reinterpreted as a pointer to `std::byte`, and the resulting byte
 * span becomes the only entry of the host buffer list.
 *
 * @tparam T Element type of the input span; must satisfy `is_byte_like_type`
 * @param host_data Input buffer in host memory
 */
template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
explicit source_info(cudf::host_span<T> host_data) : _type(io_type::HOST_BUFFER)
{
  auto const* const bytes = reinterpret_cast<std::byte const*>(host_data.data());
  _host_buffers.push_back(cudf::host_span<std::byte const>(bytes, host_data.size()));
}

Expand Down Expand Up @@ -289,7 +356,7 @@ struct source_info {
private:
io_type _type = io_type::FILEPATH;
std::vector<std::string> _filepaths;
std::vector<host_buffer> _host_buffers;
std::vector<cudf::host_span<std::byte const>> _host_buffers;
std::vector<cudf::device_span<std::byte const>> _device_buffers;
std::vector<cudf::io::datasource*> _user_sources;
};
Expand Down
8 changes: 7 additions & 1 deletion cpp/src/io/utilities/datasource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,10 +329,16 @@ std::unique_ptr<datasource> datasource::create(const std::string& filepath,
}

std::unique_ptr<datasource> datasource::create(host_buffer const& buffer)
{
  // Legacy entry point: describe the same memory as a span of bytes and hand it to the
  // span-based overload.
  auto const* const bytes = reinterpret_cast<std::byte const*>(buffer.data);
  cudf::host_span<std::byte const> const byte_span{bytes, buffer.size};
  return create(byte_span);
}

// Creates a datasource that reads from the given span of host bytes.
std::unique_ptr<datasource> datasource::create(cudf::host_span<std::byte const> buffer)
{
// Use Arrow IO buffer class for zero-copy reads of host memory
// NOTE(review): zero-copy presumably means the memory behind `buffer` must stay valid while the
// returned source is in use - confirm.
return std::make_unique<arrow_io_source>(std::make_shared<arrow::io::BufferReader>(
reinterpret_cast<const uint8_t*>(buffer.data), buffer.size));
reinterpret_cast<const uint8_t*>(buffer.data()), buffer.size()));
}

std::unique_ptr<datasource> datasource::create(cudf::device_span<std::byte const> buffer)
Expand Down
74 changes: 74 additions & 0 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,10 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest {
auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
};

// Typed-test fixture for reading Parquet data from host buffers. `T` is the element type of the
// `cudf::host_span` handed to `cudf::io::source_info` by the tests using this fixture.
template <typename T>
struct ParquetReaderSourceTest : public ParquetReaderTest {
};

// Declare typed test cases
// TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352
using SupportedTypes = cudf::test::Types<int8_t, int16_t, int32_t, int64_t, bool, float, double>;
Expand All @@ -369,6 +373,8 @@ using SupportedTimestampTypes =
cudf::test::Types<cudf::timestamp_ms, cudf::timestamp_us, cudf::timestamp_ns>;
TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes);
TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes);
using ByteLikeTypes = cudf::test::Types<int8_t, char, uint8_t, unsigned char, std::byte>;
TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes);

// Base test fixture for chunked writer tests
struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {
Expand Down Expand Up @@ -5113,4 +5119,72 @@ TEST_P(ParquetSizedTest, DictionaryTest)
EXPECT_EQ(nbits, GetParam());
}

// Writes a random table to an in-memory Parquet buffer, then reads it back through a
// `source_info` built from a single host span of `TypeParam` elements - once with mutable
// elements and once with const elements - and expects the original table each time.
TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes)
{
  using T = TypeParam;

  srand(31337);
  auto expected = create_random_fixed_table<int>(5, 5, true);

  std::vector<char> parquet_bytes;
  cudf::io::parquet_writer_options write_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&parquet_bytes), *expected);
  cudf::io::write_parquet(write_opts);

  // Span over mutable elements.
  {
    cudf::host_span<T> const bytes(reinterpret_cast<T*>(parquet_bytes.data()),
                                   parquet_bytes.size());
    cudf::io::parquet_reader_options read_opts =
      cudf::io::parquet_reader_options::builder(cudf::io::source_info(bytes));
    const auto read_back = cudf::io::read_parquet(read_opts);

    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, read_back.tbl->view());
  }

  // Span over const elements.
  {
    cudf::host_span<T const> const bytes(reinterpret_cast<T const*>(parquet_bytes.data()),
                                         parquet_bytes.size());
    cudf::io::parquet_reader_options read_opts =
      cudf::io::parquet_reader_options::builder(cudf::io::source_info(bytes));
    const auto read_back = cudf::io::read_parquet(read_opts);

    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, read_back.tbl->view());
  }
}

// Writes a random table to an in-memory Parquet buffer, then reads that buffer twice in one go
// through a `source_info` built from a span of two host spans of `TypeParam` elements (mutable,
// then const) and expects the table concatenated with itself.
TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes)
{
  using T = TypeParam;

  srand(31337);
  auto single_table = create_random_fixed_table<int>(5, 5, true);

  std::vector<char> parquet_bytes;
  cudf::io::parquet_writer_options write_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&parquet_bytes), *single_table);
  cudf::io::write_parquet(write_opts);

  auto expected = cudf::concatenate(std::vector<table_view>({*single_table, *single_table}));

  // Spans over mutable elements.
  {
    auto const bytes =
      cudf::host_span<T>(reinterpret_cast<T*>(parquet_bytes.data()), parquet_bytes.size());
    auto sources = std::vector<cudf::host_span<T>>{bytes, bytes};
    cudf::host_span<cudf::host_span<T>> const all_sources(sources.data(), sources.size());
    cudf::io::parquet_reader_options read_opts =
      cudf::io::parquet_reader_options::builder(cudf::io::source_info(all_sources));
    const auto read_back = cudf::io::read_parquet(read_opts);

    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, read_back.tbl->view());
  }

  // Spans over const elements.
  {
    auto const bytes = cudf::host_span<T const>(reinterpret_cast<T const*>(parquet_bytes.data()),
                                                parquet_bytes.size());
    auto sources = std::vector<cudf::host_span<T const>>{bytes, bytes};
    cudf::host_span<cudf::host_span<T const>> const all_sources(sources.data(), sources.size());
    cudf::io::parquet_reader_options read_opts =
      cudf::io::parquet_reader_options::builder(cudf::io::source_info(all_sources));
    const auto read_back = cudf::io::read_parquet(read_opts);

    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, read_back.tbl->view());
  }
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit c2f0161

Please sign in to comment.