Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing cudf::io::source_info to use cudf::host_span<std::byte> in a non-breaking form #12730

Merged
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cpp/include/cudf/io/datasource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,21 @@ class datasource {
/**
* @brief Creates a source from a host memory buffer.
*
* @deprecated Since 23.04; prefer the overload taking `cudf::host_span<std::byte const>`.
*
* @param[in] buffer Host buffer object
* @return Constructed datasource object
*/
static std::unique_ptr<datasource> create(host_buffer const& buffer);

/**
* @brief Creates a source from a span of bytes in host memory.
*
* @param[in] buffer Span over the constant bytes of the host buffer. NOTE(review): the span does
* not own the memory, so presumably the underlying buffer must stay valid for as long as the
* returned datasource is in use; confirm.
* @return Constructed datasource object
*/
static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);

/**
* @brief Creates a source from a device memory buffer.
*
Expand Down
75 changes: 71 additions & 4 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ struct table_with_metadata {
/**
* @brief Non-owning view of a host memory buffer
*
* @deprecated Since 23.04
*
* Used to describe buffer input in `source_info` objects.
*/
struct host_buffer {
Expand All @@ -166,6 +168,22 @@ struct host_buffer {
host_buffer(const char* data, size_t size) : data(data), size(size) {}
};

/**
 * @brief Reports whether `T` is a byte-like type, i.e. a type for which a pointer to it can
 * reasonably be treated as a pointer to raw bytes.
 *
 * cv-qualifiers on `T` are ignored. The accepted types are `int8_t`, `char`, `uint8_t`,
 * `unsigned char` and `std::byte`.
 *
 * @tparam T The representation type
 * @return `true` if the type is considered a byte-like type
 */
template <typename T>
constexpr inline auto is_byte_like_type()
{
  using unqualified_t = std::remove_cv_t<T>;
  return std::disjunction_v<std::is_same<unqualified_t, int8_t>,
                            std::is_same<unqualified_t, char>,
                            std::is_same<unqualified_t, uint8_t>,
                            std::is_same<unqualified_t, unsigned char>,
                            std::is_same<unqualified_t, std::byte>>;
}

/**
* @brief Source information for read interfaces
*/
Expand All @@ -191,21 +209,70 @@ struct source_info {
/**
* @brief Construct a new source info object for multiple buffers in host memory
*
* Each input `host_buffer` is stored as a `cudf::host_span<std::byte const>` built from the
* buffer's `data` pointer and `size`.
*
* @deprecated Since 23.04
*
* @param host_buffers Input buffers in host memory
*/
explicit source_info(std::vector<host_buffer> const& host_buffers)
: _type(io_type::HOST_BUFFER), _host_buffers(host_buffers)
explicit source_info(std::vector<host_buffer> const& host_buffers) : _type(io_type::HOST_BUFFER)
{
// Exactly one span is appended per input buffer, so size the vector once up front.
_host_buffers.reserve(host_buffers.size());
vuule marked this conversation as resolved.
Show resolved Hide resolved
std::transform(host_buffers.begin(),
host_buffers.end(),
std::back_inserter(_host_buffers),
[](auto const hb) {
// `host_buffer::data` is a `const char*`; reinterpret it as a pointer to constant bytes.
return cudf::host_span<std::byte const>{
reinterpret_cast<std::byte const*>(hb.data), hb.size};
});
}

/**
* @brief Construct a new source info object for a single buffer
*
* The pointer is reinterpreted as a pointer to constant bytes and stored, together with `size`,
* as a single `cudf::host_span<std::byte const>`.
*
* @deprecated Since 23.04
*
* @param host_data Input buffer in host memory
* @param size Size of the buffer, in bytes
*/
explicit source_info(const char* host_data, size_t size)
: _type(io_type::HOST_BUFFER), _host_buffers({{host_data, size}})
: _type(io_type::HOST_BUFFER),
_host_buffers(
{cudf::host_span<std::byte const>(reinterpret_cast<std::byte const*>(host_data), size)})
{
}

/**
* @brief Construct a new source info object for multiple buffers in host memory
*
* Every input span is stored as a `cudf::host_span<std::byte const>` covering the same pointer
* and element count.
*
* @tparam T Element type of the input spans; must satisfy `is_byte_like_type` once cv-qualifiers
* are removed
* @param host_buffers Input buffers in host memory
*/
template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)
: _type(io_type::HOST_BUFFER)
{
// Element types other than `std::byte` need their data pointer reinterpreted as a pointer to
// constant bytes before the span can be stored.
if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {
vuule marked this conversation as resolved.
Show resolved Hide resolved
_host_buffers.reserve(host_buffers.size());
std::transform(host_buffers.begin(),
host_buffers.end(),
std::back_inserter(_host_buffers),
[](auto const s) {
return cudf::host_span<std::byte const>{
reinterpret_cast<std::byte const*>(s.data()), s.size()};
});
} else {
// `T` is (possibly cv-qualified) `std::byte`: the spans are copied over without a pointer cast.
_host_buffers.assign(host_buffers.begin(), host_buffers.end());
}
}

/**
 * @brief Construct a new source info object for a single buffer
 *
 * The span is stored as a `cudf::host_span<std::byte const>` over the same pointer and
 * element count.
 *
 * @tparam T Element type of the input span; must satisfy `is_byte_like_type` once
 * cv-qualifiers are removed
 * @param host_data Input buffer in host memory
 */
template <typename T, CUDF_ENABLE_IF(is_byte_like_type<std::remove_cv_t<T>>())>
explicit source_info(cudf::host_span<T> host_data) : _type(io_type::HOST_BUFFER)
{
  // View the caller's elements as constant bytes and record that single view.
  auto const* const bytes = reinterpret_cast<std::byte const*>(host_data.data());
  _host_buffers.emplace_back(bytes, host_data.size());
}

Expand Down Expand Up @@ -289,7 +356,7 @@ struct source_info {
private:
// Kind of input this object describes; a default-initialized object is a file path source.
io_type _type = io_type::FILEPATH;
std::vector<std::string> _filepaths;
std::vector<host_buffer> _host_buffers;
// Host memory inputs, held as spans over constant bytes.
std::vector<cudf::host_span<std::byte const>> _host_buffers;
std::vector<cudf::device_span<std::byte const>> _device_buffers;
// Raw pointers to datasource objects. NOTE(review): presumably non-owning; confirm.
std::vector<cudf::io::datasource*> _user_sources;
};
Expand Down
8 changes: 7 additions & 1 deletion cpp/src/io/utilities/datasource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,10 +329,16 @@ std::unique_ptr<datasource> datasource::create(const std::string& filepath,
}

// Adapts a `host_buffer` to the `cudf::host_span<std::byte const>` overload of `create`.
std::unique_ptr<datasource> datasource::create(host_buffer const& buffer)
{
  // `host_buffer::data` is a `const char*`; view the same memory as constant bytes.
  auto const* const bytes = reinterpret_cast<std::byte const*>(buffer.data);
  return create(cudf::host_span<std::byte const>{bytes, buffer.size});
}

// Creates a datasource over a span of host bytes by wrapping an `arrow::io::BufferReader` in an
// `arrow_io_source`.
std::unique_ptr<datasource> datasource::create(cudf::host_span<std::byte const> buffer)
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
{
// Use Arrow IO buffer class for zero-copy reads of host memory
// The reader is handed a `const uint8_t*` plus a length, hence the pointer cast.
return std::make_unique<arrow_io_source>(std::make_shared<arrow::io::BufferReader>(
reinterpret_cast<const uint8_t*>(buffer.data), buffer.size));
reinterpret_cast<const uint8_t*>(buffer.data()), buffer.size()));
}

std::unique_ptr<datasource> datasource::create(cudf::device_span<std::byte const> buffer)
Expand Down
51 changes: 51 additions & 0 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,11 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest {
auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
};

// Typed fixture for reader tests that are parameterized on the element type `T` of the
// host-memory source handed to the reader.
template <typename T>
struct ParquetReaderSourceTest : public ParquetReaderTest {
// Returns the `cudf::data_type` built from `cudf::type_to_id<T>()`.
auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
};

// Declare typed test cases
// TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352
using SupportedTypes = cudf::test::Types<int8_t, int16_t, int32_t, int64_t, bool, float, double>;
Expand All @@ -369,6 +374,8 @@ using SupportedTimestampTypes =
cudf::test::Types<cudf::timestamp_ms, cudf::timestamp_us, cudf::timestamp_ns>;
TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes);
TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes);
// Element types used to instantiate the host-span reader source tests; the list mirrors the
// types accepted by `is_byte_like_type`.
using ByteLikeTypes = cudf::test::Types<int8_t, char, uint8_t, unsigned char, std::byte>;
TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes);

// Base test fixture for chunked writer tests
struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {
Expand Down Expand Up @@ -5113,4 +5120,48 @@ TEST_P(ParquetSizedTest, DictionaryTest)
EXPECT_EQ(nbits, GetParam());
}

// Writes a random table to an in-memory parquet file, reads it back through a `source_info`
// built from a single `cudf::host_span<T>`, and expects the same table.
TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes)
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
{
using T = TypeParam;

// Seed the C random generator with a fixed value (presumably so the generated table is
// reproducible; confirm that `create_random_fixed_table` draws from `rand`).
srand(31337);
auto table = create_random_fixed_table<int>(5, 5, true);

// Serialize the table as parquet into a host vector.
std::vector<char> out_buffer;
cudf::io::parquet_writer_options out_opts =
cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table);
cudf::io::write_parquet(out_opts);

// View the written bytes as a span of `T` and read the file back from that span.
cudf::io::parquet_reader_options in_opts =
cudf::io::parquet_reader_options::builder(cudf::io::source_info(
cudf::host_span<T>(reinterpret_cast<T*>(out_buffer.data()), out_buffer.size())));
const auto result = cudf::io::read_parquet(in_opts);

// The table read back must match the one that was written.
CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view());
}

// Supplies the same in-memory parquet file twice, as a span of two `cudf::host_span<T>`
// sources, and expects the read result to equal the written table concatenated with itself.
TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes)
{
  using T = TypeParam;

  srand(31337);
  auto expected = create_random_fixed_table<int>(5, 5, true);

  // Serialize the table as parquet into a host vector.
  std::vector<char> parquet_bytes;
  cudf::io::parquet_writer_options writer_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&parquet_bytes), *expected);
  cudf::io::write_parquet(writer_opts);

  // Two identical views of the written bytes, typed as `T`.
  auto const file_view =
    cudf::host_span<T>(reinterpret_cast<T*>(parquet_bytes.data()), parquet_bytes.size());
  std::vector<cudf::host_span<T>> sources(2, file_view);

  auto const sources_view =
    cudf::host_span<cudf::host_span<T>>(sources.data(), sources.size());
  cudf::io::parquet_reader_options reader_opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info(sources_view));
  auto const result = cudf::io::read_parquet(reader_opts);

  // Reading both sources should yield the original table followed by itself.
  auto doubled = cudf::concatenate(std::vector<table_view>({*expected, *expected}));

  CUDF_TEST_EXPECT_TABLES_EQUAL(*doubled, result.tbl->view());
}

CUDF_TEST_PROGRAM_MAIN()