Skip to content

Commit

Permalink
Partial clean up of ORC writer (#7324)
Browse files Browse the repository at this point in the history
Issue #6763

Clean up of the code surrounding the column data encode in the ORC writer:
1. Add a 2D version of `hostdevice_vector` (single allocation);
2. Add 2D versions of `host_span` and `device_span`;
3. Add implicit conversions from `hostdevice_vector` to `host_span` and `device_span`.
4. Use the new types to represent collections that currently use flattened `hostdevice_vectors`;
5. Separated a part of `EncChunk` into a separate class, `encoder_chunk_streams`, as this is the only part used after data encode;
6. Add `orc_streams` to represent per-column streams and compute offsets.
7. Partial `writer_impl.cu` code "modernization".
8. Removed redundant size parameters (since 2dspan and 2dvector hold the size info).
9. use `device_uvector` instead of `device_vector`.

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - Jake Hemstad (@jrhemstad)
  - Kumar Aatish (@kaatish)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: #7324
vuule authored Mar 4, 2021

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent 72438d8 commit d619f77
Showing 11 changed files with 965 additions and 765 deletions.
87 changes: 87 additions & 0 deletions cpp/include/cudf/utilities/span.hpp
Original file line number Diff line number Diff line change
@@ -135,6 +135,16 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
constexpr host_span(C const& in) : base(in.data(), in.size())
{
}

template <typename OtherT,
std::size_t OtherExtent,
typename std::enable_if<(Extent == OtherExtent || Extent == dynamic_extent) &&
std::is_convertible<OtherT (*)[], T (*)[]>::value,
void>::type* = nullptr>
constexpr host_span(const host_span<OtherT, OtherExtent>& other) noexcept
: base(other.data(), other.size())
{
}
};

// ===== device_span ===============================================================================
@@ -174,6 +184,83 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
{
}

template <typename OtherT,
std::size_t OtherExtent,
typename std::enable_if<(Extent == OtherExtent || Extent == dynamic_extent) &&
std::is_convertible<OtherT (*)[], T (*)[]>::value,
void>::type* = nullptr>
constexpr device_span(const device_span<OtherT, OtherExtent>& other) noexcept
: base(other.data(), other.size())
{
}
};

namespace detail {

/**
* @brief Generic class for row-major 2D spans. Not compliant with STL container semantics/syntax.
*
* The index operator returns the corresponding row.
*/
template <typename T, template <typename, std::size_t> typename RowType>
class base_2dspan {
public:
using size_type = std::pair<size_t, size_t>;

constexpr base_2dspan() noexcept = default;
constexpr base_2dspan(T* data, size_t rows, size_t columns) noexcept
: _data{data}, _size{rows, columns}
{
}
base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{size} {}

constexpr auto data() const noexcept { return _data; }
constexpr auto size() const noexcept { return _size; }
constexpr auto count() const noexcept { return size().first * size().second; }
constexpr bool is_empty() const noexcept { return count() == 0; }

static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept
{
return row * size.second + column;
}

constexpr RowType<T, dynamic_extent> operator[](size_t row)
{
return {this->data() + flatten_index(row, 0, this->size()), this->size().second};
}

template <typename OtherT,
template <typename, size_t>
typename OtherRowType,
typename std::enable_if<std::is_convertible<OtherRowType<OtherT, dynamic_extent>,
RowType<T, dynamic_extent>>::value,
void>::type* = nullptr>
constexpr base_2dspan(base_2dspan<OtherT, OtherRowType> const& other) noexcept
: _data{other.data()}, _size{other.size()}
{
}

protected:
T* _data = nullptr;
size_type _size{0, 0};
};

/**
* @brief Alias for the 2D span for host data.
*
* Index operator returns rows as `host_span`.
*/
template <class T>
using host_2dspan = base_2dspan<T, host_span>;

/**
* @brief Alias for the 2D span for device data.
*
* Index operator returns rows as `device_span`.
*/
template <class T>
using device_2dspan = base_2dspan<T, device_span>;

} // namespace detail
} // namespace cudf
4 changes: 2 additions & 2 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
@@ -404,7 +404,7 @@ __global__ void __launch_bounds__(block_size)
* @param[in] chunks DictionaryChunk device array [rowgroup][column]
* @param[in] num_columns Number of columns
* @param[in] num_rowgroups Number of row groups
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void InitDictionaryIndices(DictionaryChunk *chunks,
uint32_t num_columns,
@@ -425,7 +425,7 @@ void InitDictionaryIndices(DictionaryChunk *chunks,
* @param[in] num_stripes Number of stripes
* @param[in] num_rowgroups Number of row groups
* @param[in] num_columns Number of columns
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void BuildStripeDictionaries(StripeDictionary *stripes,
StripeDictionary *stripes_host,
111 changes: 55 additions & 56 deletions cpp/src/io/orc/orc_gpu.h
Original file line number Diff line number Diff line change
@@ -124,19 +124,25 @@ struct RowGroup {
* @brief Struct to describe an encoder data chunk
*/
struct EncChunk {
uint8_t *streams[CI_NUM_STREAMS]; // encoded output
int32_t strm_id[CI_NUM_STREAMS]; // stream id or -1 if not present
uint32_t strm_len[CI_NUM_STREAMS]; // in: max length, out: actual length
const uint32_t *valid_map_base; // base ptr of input valid bit map
size_type column_offset; // index of the first element relative to the base memory
const void *column_data_base; // base ptr of input column data
uint32_t start_row; // start row of this chunk
uint32_t num_rows; // number of rows in this chunk
uint32_t valid_rows; // max number of valid rows
uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind)
uint8_t type_kind; // column data type (orc::TypeKind)
uint8_t dtype_len; // data type length
uint8_t scale; // scale for decimals or timestamps
const uint32_t *valid_map_base; // base ptr of input valid bit map
size_type column_offset; // index of the first element relative to the base memory
const void *column_data_base; // base ptr of input column data
uint32_t start_row; // start row of this chunk
uint32_t num_rows; // number of rows in this chunk
uint32_t valid_rows; // max number of valid rows
uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind)
uint8_t type_kind; // column data type (orc::TypeKind)
uint8_t dtype_len; // data type length
uint8_t scale; // scale for decimals or timestamps
};

/**
* @brief Struct to describe the streams that correspond to a single `EncChunk`.
*/
struct encoder_chunk_streams {
uint8_t *data_ptrs[CI_NUM_STREAMS]; // encoded output
int32_t ids[CI_NUM_STREAMS]; // stream id; -1 if stream is not present
uint32_t lengths[CI_NUM_STREAMS]; // in: max length, out: actual length
};

/**
@@ -193,7 +199,7 @@ struct StripeDictionary {
* @param[in] compression_block_size maximum size of compressed blocks (up to 16M)
* @param[in] log2maxcr log2 of maximum compression ratio (used to infer max uncompressed size from
*compressed size)
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void ParseCompressedStripeData(CompressedStreamInfo *strm_info,
int32_t num_streams,
@@ -206,7 +212,7 @@ void ParseCompressedStripeData(CompressedStreamInfo *strm_info,
*
* @param[in] strm_info List of compressed streams
* @param[in] num_streams Number of compressed streams
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void PostDecompressionReassemble(CompressedStreamInfo *strm_info,
int32_t num_streams,
@@ -221,7 +227,7 @@ void PostDecompressionReassemble(CompressedStreamInfo *strm_info,
* @param[in] num_columns Number of columns
* @param[in] num_stripes Number of stripes
* @param[in] num_rowgroups Number of row groups
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void ParseRowGroupIndex(RowGroup *row_groups,
CompressedStreamInfo *strm_info,
@@ -241,7 +247,7 @@ void ParseRowGroupIndex(RowGroup *row_groups,
* @param[in] num_stripes Number of stripes
* @param[in] max_rows Maximum number of rows to load
* @param[in] first_row Crop all rows below first_row
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
DictionaryEntry *global_dictionary,
@@ -265,9 +271,9 @@ void DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
* @param[in] row_groups Optional row index data
* @param[in] num_rowgroups Number of row groups in row index data
* @param[in] rowidx_stride Row index stride
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void DecodeOrcColumnData(ColumnDesc *chunks,
void DecodeOrcColumnData(ColumnDesc const *chunks,
DictionaryEntry *global_dictionary,
uint32_t num_columns,
uint32_t num_stripes,
@@ -282,79 +288,72 @@ void DecodeOrcColumnData(ColumnDesc *chunks,
/**
* @brief Launches kernel for encoding column data
*
* @param[in] chunks EncChunk device array [rowgroup][column]
* @param[in] num_columns Number of columns
* @param[in] num_rowgroups Number of row groups
* @param[in] stream CUDA stream to use, default 0
* @param[in] chunks encoder chunk device array [column][rowgroup]
* @param[in, out] streams chunk streams device array [column][rowgroup]
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void EncodeOrcColumnData(EncChunk *chunks,
uint32_t num_columns,
uint32_t num_rowgroups,
void EncodeOrcColumnData(detail::device_2dspan<EncChunk const> chunks,
detail::device_2dspan<encoder_chunk_streams> streams,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Launches kernel for encoding column dictionaries
*
* @param[in] stripes Stripe dictionaries device array [stripe][string_column]
* @param[in] chunks EncChunk device array [rowgroup][column]
* @param[in] chunks encoder chunk device array [column][rowgroup]
* @param[in] num_string_columns Number of string columns
* @param[in] num_columns Number of columns
* @param[in] num_stripes Number of stripes
* @param[in] stream CUDA stream to use, default 0
* @param[in,out] enc_streams chunk streams device array [column][rowgroup]
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void EncodeStripeDictionaries(StripeDictionary *stripes,
EncChunk *chunks,
detail::device_2dspan<EncChunk const> chunks,
uint32_t num_string_columns,
uint32_t num_columns,
uint32_t num_stripes,
detail::device_2dspan<encoder_chunk_streams> enc_streams,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Launches kernel for compacting chunked column data prior to compression
*
* @param[in] strm_desc StripeStream device array [stripe][stream]
* @param[in] chunks EncChunk device array [rowgroup][column]
* @param[in] num_stripe_streams Total number of streams
* @param[in] num_columns Number of columns
* @param[in] stream CUDA stream to use, default 0
* @param[in,out] strm_desc StripeStream device array [stripe][stream]
* @param[in,out] enc_streams chunk streams device array [column][rowgroup]
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void CompactOrcDataStreams(StripeStream *strm_desc,
EncChunk *chunks,
uint32_t num_stripe_streams,
uint32_t num_columns,
void CompactOrcDataStreams(detail::device_2dspan<StripeStream> strm_desc,
detail::device_2dspan<encoder_chunk_streams> enc_streams,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Launches kernel(s) for compressing data streams
*
* @param[in] compressed_data Output compressed blocks
* @param[in] strm_desc StripeStream device array [stripe][stream]
* @param[in] chunks EncChunk device array [rowgroup][column]
* @param[in] num_compressed_blocks Total number of compressed blocks
* @param[in] compression Type of compression
* @param[in] comp_blk_size Compression block size
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
* @param[in,out] strm_desc StripeStream device array [stripe][stream]
* @param[in,out] enc_streams chunk streams device array [column][rowgroup]
* @param[out] comp_in Per-block compression input parameters
* @param[out] comp_out Per-block compression status
* @param[in] num_stripe_streams Total number of streams
* @param[in] compression Type of compression
* @param[in] num_compressed_blocks Total number of compressed blocks
* @param[in] stream CUDA stream to use, default 0
*/
void CompressOrcDataStreams(uint8_t *compressed_data,
StripeStream *strm_desc,
EncChunk *chunks,
gpu_inflate_input_s *comp_in,
gpu_inflate_status_s *comp_out,
uint32_t num_stripe_streams,
uint32_t num_compressed_blocks,
CompressionKind compression,
uint32_t comp_blk_size,
detail::device_2dspan<StripeStream> strm_desc,
detail::device_2dspan<encoder_chunk_streams> enc_streams,
gpu_inflate_input_s *comp_in,
gpu_inflate_status_s *comp_out,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

/**
* @brief Launches kernel for initializing dictionary chunks
*
* @param[in] chunks DictionaryChunk device array [rowgroup][column]
* @param[in,out] chunks DictionaryChunk device array [rowgroup][column]
* @param[in] num_columns Number of columns
* @param[in] num_rowgroups Number of row groups
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void InitDictionaryIndices(DictionaryChunk *chunks,
uint32_t num_columns,
@@ -370,7 +369,7 @@ void InitDictionaryIndices(DictionaryChunk *chunks,
* @param[in] num_stripes Number of stripes
* @param[in] num_rowgroups Number of row groups
* @param[in] num_columns Number of columns
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void BuildStripeDictionaries(StripeDictionary *stripes_dev,
StripeDictionary *stripes_host,
@@ -388,7 +387,7 @@ void BuildStripeDictionaries(StripeDictionary *stripes_dev,
* @param[in] num_columns Number of columns
* @param[in] num_rowgroups Number of rowgroups
* @param[in] row_index_stride Rowgroup size in rows
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void orc_init_statistics_groups(statistics_group *groups,
const stats_column_desc *cols,
@@ -403,7 +402,7 @@ void orc_init_statistics_groups(statistics_group *groups,
* @param[in,out] groups Statistics merge groups
* @param[in] chunks Statistics chunks
* @param[in] statistics_count Number of statistics buffers to encode
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void orc_init_statistics_buffersize(statistics_merge_group *groups,
const statistics_chunk *chunks,
4 changes: 2 additions & 2 deletions cpp/src/io/orc/stats_enc.cu
Original file line number Diff line number Diff line change
@@ -371,7 +371,7 @@ __global__ void __launch_bounds__(encode_threads_per_block)
* @param[in] num_columns Number of columns
* @param[in] num_rowgroups Number of rowgroups
* @param[in] row_index_stride Rowgroup size in rows
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void orc_init_statistics_groups(statistics_group *groups,
const stats_column_desc *cols,
@@ -392,7 +392,7 @@ void orc_init_statistics_groups(statistics_group *groups,
* @param[in,out] groups Statistics merge groups
* @param[in] chunks Statistics chunks
* @param[in] statistics_count Number of statistics buffers to encode
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void orc_init_statistics_buffersize(statistics_merge_group *groups,
const statistics_chunk *chunks,
8 changes: 4 additions & 4 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
@@ -1374,7 +1374,7 @@ static const __device__ __constant__ uint32_t kTimestampNanoScale[8] = {
// blockDim {block_size,1,1}
template <int block_size>
__global__ void __launch_bounds__(block_size)
gpuDecodeOrcColumnData(ColumnDesc *chunks,
gpuDecodeOrcColumnData(ColumnDesc const *chunks,
DictionaryEntry *global_dictionary,
timezone_table_view tz_table,
const RowGroup *row_groups,
@@ -1742,7 +1742,7 @@ __global__ void __launch_bounds__(block_size)
* @param[in] num_stripes Number of stripes
* @param[in] max_rows Maximum number of rows to load
* @param[in] first_row Crop all rows below first_row
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
DictionaryEntry *global_dictionary,
@@ -1771,9 +1771,9 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
* @param[in] row_groups Optional row index data
* @param[in] num_rowgroups Number of row groups in row index data
* @param[in] rowidx_stride Row index stride
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void __host__ DecodeOrcColumnData(ColumnDesc *chunks,
void __host__ DecodeOrcColumnData(ColumnDesc const *chunks,
DictionaryEntry *global_dictionary,
uint32_t num_columns,
uint32_t num_stripes,
229 changes: 95 additions & 134 deletions cpp/src/io/orc/stripe_enc.cu

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cpp/src/io/orc/stripe_init.cu
Original file line number Diff line number Diff line change
@@ -485,7 +485,7 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info,
* @param[in] num_columns Number of columns
* @param[in] num_stripes Number of stripes
* @param[in] num_rowgroups Number of row groups
* @param[in] stream CUDA stream to use, default 0
* @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
*/
void __host__ ParseRowGroupIndex(RowGroup *row_groups,
CompressedStreamInfo *strm_info,
918 changes: 452 additions & 466 deletions cpp/src/io/orc/writer_impl.cu

Large diffs are not rendered by default.

240 changes: 140 additions & 100 deletions cpp/src/io/orc/writer_impl.hpp
Original file line number Diff line number Diff line change
@@ -28,7 +28,9 @@
#include <cudf/table/table.hpp>
#include <cudf/utilities/error.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <memory>
#include <string>
@@ -43,6 +45,75 @@ class orc_column_view;

using namespace cudf::io::orc;
using namespace cudf::io;
using cudf::detail::device_2dspan;
using cudf::detail::host_2dspan;
using cudf::detail::hostdevice_2dvector;

/**
* @brief Indices of rowgroups contained in a stripe.
*
* Provides a container-like interface to iterate over rowgroup indices.
*/
struct stripe_rowgroups {
uint32_t id; // stripe id
uint32_t first; // first rowgroup in the stripe
uint32_t size; // number of rowgroups in the stripe
stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {}
auto cbegin() const { return thrust::make_counting_iterator(first); }
auto cend() const { return thrust::make_counting_iterator(first + size); }
};

/**
* @brief Returns the total number of rowgroups in the list of contigious stripes.
*/
inline auto stripes_size(host_span<stripe_rowgroups const> stripes)
{
return !stripes.empty() ? *stripes.back().cend() - stripes.front().first : 0;
}

/**
* @brief List of per-column ORC streams.
*
* Provides interface to calculate their offsets.
*/
class orc_streams {
public:
orc_streams(std::vector<Stream> streams, std::vector<int32_t> ids)
: streams{std::move(streams)}, ids{std::move(ids)}
{
}
Stream const& operator[](int idx) const { return streams[idx]; }
Stream& operator[](int idx) { return streams[idx]; }
auto id(int idx) const { return ids[idx]; }
auto& id(int idx) { return ids[idx]; }
auto size() const { return streams.size(); }

/**
* @brief List of ORC stream offsets and their total size.
*/
struct orc_stream_offsets {
std::vector<size_t> offsets;
size_t str_data_size = 0;
size_t rle_data_size = 0;
auto data_size() const { return str_data_size + rle_data_size; }
};
orc_stream_offsets compute_offsets(host_span<orc_column_view const> columns,
size_t num_rowgroups) const;

operator std::vector<Stream> const&() const { return streams; }

private:
std::vector<Stream> streams;
std::vector<int32_t> ids;
};

/**
* @brief ORC per-chunk streams of encoded data.
*/
struct encoded_data {
rmm::device_uvector<uint8_t> data; // Owning array of the encoded data
hostdevice_2dvector<gpu::encoder_chunk_streams> streams; // streams of encoded data, per chunk
};

/**
* @brief Implementation for ORC writer
@@ -115,171 +186,140 @@ class writer::impl {
* @brief Builds up column dictionaries indices
*
* @param columns List of columns
* @param num_rows Total number of rows
* @param str_col_ids List of columns that are strings type
* @param dict_data Dictionary data memory
* @param dict_index Dictionary index memory
* @param dict List of dictionary chunks
*/
void init_dictionaries(orc_column_view* columns,
size_t num_rows,
std::vector<int> const& str_col_ids,
uint32_t* dict_data,
uint32_t* dict_index,
hostdevice_vector<gpu::DictionaryChunk>& dict);
hostdevice_vector<gpu::DictionaryChunk>* dict);

/**
* @brief Builds up per-stripe dictionaries for string columns
* @brief Builds up per-stripe dictionaries for string columns.
*
* @param columns List of columns
* @param num_rows Total number of rows
* @param str_col_ids List of columns that are strings type
* @param stripe_list List of stripe boundaries
* @param stripe_bounds List of stripe boundaries
* @param dict List of dictionary chunks
* @param dict_index List of dictionary indices
* @param stripe_dict List of stripe dictionaries
*/
void build_dictionaries(orc_column_view* columns,
size_t num_rows,
std::vector<int> const& str_col_ids,
std::vector<uint32_t> const& stripe_list,
host_span<stripe_rowgroups const> stripe_bounds,
hostdevice_vector<gpu::DictionaryChunk> const& dict,
uint32_t* dict_index,
hostdevice_vector<gpu::StripeDictionary>& stripe_dict);

/**
* @brief Returns stream information for each column
* @brief Builds up per-column streams.
*
* @param columns List of columns
* @param num_columns Total number of columns
* @param num_rows Total number of rows
* @param stripe_list List of stripe boundaries
* @param strm_ids List of unique stream identifiers
* @param[in,out] columns List of columns
* @param[in] stripe_bounds List of stripe boundaries
* @return List of stream descriptors
*/
orc_streams create_streams(host_span<orc_column_view> columns,
host_span<stripe_rowgroups const> stripe_bounds);

/**
* @brief Gathers stripe information.
*
* @return The streams
* @param columns List of columns
* @param num_rowgroups Total number of rowgroups
* @return List of stripe descriptors
*/
std::vector<Stream> gather_streams(orc_column_view* columns,
size_t num_columns,
size_t num_rows,
std::vector<uint32_t> const& stripe_list,
std::vector<int32_t>& strm_ids);
std::vector<stripe_rowgroups> gather_stripe_info(host_span<orc_column_view const> columns,
size_t num_rowgroups);

/**
* @brief Encodes the streams as a series of column data chunks
* @brief Encodes the input columns into streams.
*
* @param columns List of columns
* @param num_columns Total number of columns
* @param num_rows Total number of rows
* @param num_rowgroups Total number of row groups
* @param str_col_ids List of columns that are strings type
* @param stripe_list List of stripe boundaries
* @param streams List of columns' index and data streams
* @param strm_ids List of unique stream identifiers
* @param chunks List of column data chunks
*
* @return Device buffer containing encoded data
* @param stripe_bounds List of stripe boundaries
* @param stream CUDA stream used for device memory operations and kernel launches
* @return Encoded data and per-chunk stream descriptors
*/
rmm::device_buffer encode_columns(orc_column_view* columns,
size_t num_columns,
size_t num_rows,
size_t num_rowgroups,
std::vector<int> const& str_col_ids,
std::vector<uint32_t> const& stripe_list,
std::vector<Stream> const& streams,
std::vector<int32_t> const& strm_ids,
hostdevice_vector<gpu::EncChunk>& chunks);
encoded_data encode_columns(host_span<orc_column_view const> columns,
std::vector<int> const& str_col_ids,
host_span<stripe_rowgroups const> stripe_bounds,
orc_streams const& streams);

/**
* @brief Returns stripe information after compacting columns' individual data
* chunks into contiguous data streams
* chunks into contiguous data streams.
*
* @param num_columns Total number of columns
* @param num_rows Total number of rows
* @param num_index_streams Total number of index streams
* @param num_data_streams Total number of data streams
* @param stripe_list List of stripe boundaries
* @param chunks List of column data chunks
* @param strm_desc List of stream descriptors
* @param[in] num_rows Total number of rows
* @param[in] num_index_streams Total number of index streams
* @param[in] stripe_bounds List of stripe boundaries
* @param[in,out] enc_streams List of encoder chunk streams [column][rowgroup]
* @param[in,out] strm_desc List of stream descriptors [stripe][data_stream]
*
* @return The stripes' information
*/
std::vector<StripeInformation> gather_stripes(size_t num_columns,
size_t num_rows,
size_t num_index_streams,
size_t num_data_streams,
std::vector<uint32_t> const& stripe_list,
hostdevice_vector<gpu::EncChunk>& chunks,
hostdevice_vector<gpu::StripeStream>& strm_desc);
std::vector<StripeInformation> gather_stripes(
size_t num_rows,
size_t num_index_streams,
host_span<stripe_rowgroups const> stripe_bounds,
hostdevice_2dvector<gpu::encoder_chunk_streams>* enc_streams,
hostdevice_2dvector<gpu::StripeStream>* strm_desc);

/**
* @brief Returns per-stripe and per-file column statistics encoded
* in ORC protobuf format
* in ORC protobuf format.
*
* @param columns List of columns
* @param num_columns Total number of columns
* @param num_rows Total number of rows
* @param num_rowgroups Total number of row groups
* @param stripe_list Number of rowgroups in each stripe
* @param stripes Stripe information
* @param chunks List of column data chunks
* @param stripe_bounds List of stripe boundaries
*
* @return The statistic blobs
*/
std::vector<std::vector<uint8_t>> gather_statistic_blobs(
orc_column_view const* columns,
size_t num_columns,
size_t num_rows,
size_t num_rowgroups,
std::vector<uint32_t> const& stripe_list,
std::vector<StripeInformation> const& stripes,
hostdevice_vector<gpu::EncChunk>& chunks);
host_span<orc_column_view const> columns, host_span<stripe_rowgroups const> stripe_bounds);

/**
* @brief Write the specified column's row index stream
* @brief Writes the specified column's row index stream.
*
* @param stripe_id Stripe's identifier
* @param stream_id Stream's identifier
* @param columns List of columns
* @param num_columns Total number of columns
* @param num_data_streams Total number of data streams
* @param group Starting row group in the stripe
* @param groups_in_stripe Number of row groups in the stripe
* @param chunks List of all column chunks
* @param strm_desc List of stream descriptors
* @param comp_out Output status for compressed streams
* @param streams List of all streams
* @param pbw Protobuf writer
* @param[in] stripe_id Stripe's identifier
* @param[in] stream_id Stream identifier (column id + 1)
* @param[in] columns List of columns
* @param[in] rowgroups_range Indexes of rowgroups in the stripe
* @param[in] enc_streams List of encoder chunk streams [column][rowgroup]
* @param[in] strm_desc List of stream descriptors
* @param[in] comp_out Output status for compressed streams
* @param[in,out] stripe Stream's parent stripe
* @param[in,out] streams List of all streams
* @param[in,out] pbw Protobuf writer
*/
void write_index_stream(int32_t stripe_id,
int32_t stream_id,
orc_column_view* columns,
size_t num_columns,
size_t num_data_streams,
size_t group,
size_t groups_in_stripe,
hostdevice_vector<gpu::EncChunk> const& chunks,
hostdevice_vector<gpu::StripeStream> const& strm_desc,
hostdevice_vector<gpu_inflate_status_s> const& comp_out,
StripeInformation& stripe,
std::vector<Stream>& streams,
host_span<orc_column_view const> columns,
stripe_rowgroups const& rowgroups_range,
host_2dspan<gpu::encoder_chunk_streams const> enc_streams,
host_2dspan<gpu::StripeStream const> strm_desc,
host_span<gpu_inflate_status_s const> comp_out,
StripeInformation* stripe,
orc_streams* streams,
ProtobufWriter* pbw);

/**
* @brief Write the specified column's data streams
*
* @param strm_desc Stream's descriptor
* @param chunk First column chunk of the stream
* @param compressed_data Compressed stream data
* @param stream_out Temporary host output buffer
* @param stripe Stream's parent stripe
* @param streams List of all streams
* @param[in] strm_desc Stream's descriptor
* @param[in] enc_stream Chunk's streams
* @param[in] compressed_data Compressed stream data
* @param[in,out] stream_out Temporary host output buffer
* @param[in,out] stripe Stream's parent stripe
* @param[in,out] streams List of all streams
*/
void write_data_stream(gpu::StripeStream const& strm_desc,
gpu::EncChunk const& chunk,
gpu::encoder_chunk_streams const& enc_stream,
uint8_t const* compressed_data,
uint8_t* stream_out,
StripeInformation& stripe,
std::vector<Stream>& streams);
StripeInformation* stripe,
orc_streams* streams);

/**
* @brief Insert 3-byte uncompressed block headers in a byte vector
61 changes: 61 additions & 0 deletions cpp/src/io/utilities/hostdevice_vector.hpp
Original file line number Diff line number Diff line change
@@ -17,6 +17,7 @@
#pragma once

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
@@ -91,6 +92,12 @@ class hostdevice_vector {
return reinterpret_cast<T const *>(d_data.data()) + offset;
}

operator cudf::device_span<T>() { return {d_data.data(), max_elements}; }
operator cudf::device_span<T const>() const { return {d_data.data(), max_elements}; }

operator cudf::host_span<T>() { return {h_data, max_elements}; }
operator cudf::host_span<T const>() const { return {h_data, max_elements}; }

void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false)
{
CUDA_TRY(cudaMemcpyAsync(
@@ -125,3 +132,57 @@ class hostdevice_vector {
T *h_data{};
rmm::device_buffer d_data{};
};

namespace cudf {
namespace detail {

/**
* @brief Wrapper around hostdevice_vector to enable two-dimensional indexing.
*
* Does not incur additional allocations.
*/
template <typename T>
class hostdevice_2dvector {
public:
hostdevice_2dvector(size_t rows,
size_t columns,
rmm::cuda_stream_view stream = rmm::cuda_stream_default)
: _size{rows, columns}, _data{rows * columns, stream}
{
}

operator device_2dspan<T>() { return {_data.device_ptr(), _size}; }
operator device_2dspan<T const>() const { return {_data.device_ptr(), _size}; }

operator host_2dspan<T>() { return {_data.host_ptr(), _size}; }
operator host_2dspan<T const>() const { return {_data.host_ptr(), _size}; }

host_span<T> operator[](size_t row)
{
return {_data.host_ptr() + host_2dspan<T>::flatten_index(row, 0, _size), _size.second};
}

host_span<T const> operator[](size_t row) const
{
return {_data.host_ptr() + host_2dspan<T>::flatten_index(row, 0, _size), _size.second};
}

auto size() const noexcept { return _size; }

void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false)
{
_data.host_to_device(stream, synchronize);
}

void device_to_host(rmm::cuda_stream_view stream, bool synchronize = false)
{
_data.device_to_host(stream, synchronize);
}

private:
hostdevice_vector<T> _data;
typename host_2dspan<T>::size_type _size;
};

} // namespace detail
} // namespace cudf
66 changes: 66 additions & 0 deletions cpp/tests/utilities_tests/span_tests.cu
Original file line number Diff line number Diff line change
@@ -15,6 +15,7 @@
*/

#include <cudf/utilities/span.hpp>
#include <io/utilities/hostdevice_vector.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/cudf_gtest.hpp>
@@ -29,6 +30,9 @@

using cudf::device_span;
using cudf::host_span;
using cudf::detail::device_2dspan;
using cudf::detail::host_2dspan;
using cudf::detail::hostdevice_2dvector;

template <typename T>
void expect_equivolent(host_span<T> a, host_span<T> b)
@@ -237,4 +241,66 @@ TEST(SpanTest, CanUseDeviceSpan)
ASSERT_TRUE(h_message[0]);
}

class MdSpanTest : public cudf::test::BaseFixture {
};

TEST(MdSpanTest, CanDetermineEmptiness)
{
auto const vector = hostdevice_2dvector<int>(1, 2);
auto const no_rows_vector = hostdevice_2dvector<int>(0, 2);
auto const no_columns_vector = hostdevice_2dvector<int>(1, 0);

EXPECT_FALSE(host_2dspan<int const>{vector}.is_empty());
EXPECT_FALSE(device_2dspan<int const>{vector}.is_empty());
EXPECT_TRUE(host_2dspan<int const>{no_rows_vector}.is_empty());
EXPECT_TRUE(device_2dspan<int const>{no_rows_vector}.is_empty());
EXPECT_TRUE(host_2dspan<int const>{no_columns_vector}.is_empty());
EXPECT_TRUE(device_2dspan<int const>{no_columns_vector}.is_empty());
}

__global__ void readwrite_kernel(device_2dspan<int> result)
{
if (result[5][6] == 5) {
result[5][6] *= 6;
} else {
result[5][6] = 5;
}
}

TEST(MdSpanTest, DeviceReadWrite)
{
auto vector = hostdevice_2dvector<int>(11, 23);

readwrite_kernel<<<1, 1>>>(vector);
readwrite_kernel<<<1, 1>>>(vector);
vector.device_to_host(rmm::cuda_stream_default, true);
EXPECT_EQ(vector[5][6], 30);
}

TEST(MdSpanTest, HostReadWrite)
{
auto vector = hostdevice_2dvector<int>(11, 23);
auto span = host_2dspan<int>{vector};
span[5][6] = 5;
if (span[5][6] == 5) { span[5][6] *= 6; }

EXPECT_EQ(vector[5][6], 30);
}

TEST(MdSpanTest, CanGetSize)
{
auto const vector = hostdevice_2dvector<int>(1, 2);

EXPECT_EQ(host_2dspan<int const>{vector}.size(), vector.size());
EXPECT_EQ(device_2dspan<int const>{vector}.size(), vector.size());
}

TEST(MdSpanTest, CanGetCount)
{
auto const vector = hostdevice_2dvector<int>(11, 23);

EXPECT_EQ(host_2dspan<int const>{vector}.count(), 11ul * 23);
EXPECT_EQ(device_2dspan<int const>{vector}.count(), 11ul * 23);
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit d619f77

Please sign in to comment.