Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor host decompression in ORC reader #10764

Merged
merged 24 commits into from
May 16, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
42252bf
spans
vuule May 2, 2022
ec2654e
remove IO_UNCOMP_STREAM_TYPE
vuule May 2, 2022
75b67a8
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
vuule May 2, 2022
172912a
copyright year
vuule May 2, 2022
e617c62
start HostDecompressor removal
vuule May 2, 2022
5c34e8c
style
vuule May 2, 2022
21ddaf3
remove hostdecompressor
vuule May 3, 2022
49d3eab
merge io_uncompress_single_h2d and get_uncompressed_data
vuule May 3, 2022
0163b1c
decompress to vector char ->uint8_t
vuule May 3, 2022
f8be114
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
vuule May 3, 2022
7d9283e
throw instead of returning empty output
vuule May 3, 2022
33ba1ee
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
vuule May 4, 2022
a61e241
more spans
vuule May 5, 2022
2bdb315
reference
vuule May 5, 2022
71f5f4a
min
vuule May 5, 2022
c819249
docs and style
vuule May 5, 2022
5986de2
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
vuule May 5, 2022
5812e8b
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
vuule May 7, 2022
274888d
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
vuule May 13, 2022
feffa9b
fallthrough
vuule May 13, 2022
1bb89d0
code review
vuule May 13, 2022
4258c78
Apply suggestions from code review
vuule May 13, 2022
c38f7a5
Merge branch 'orc-host-decomp' of http://github.com/vuule/cudf into o…
vuule May 13, 2022
14952a1
style
vuule May 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ enum class compression_type {
BZIP2, ///< BZIP2 format, using Burrows-Wheeler transform
BROTLI, ///< BROTLI format, using LZ77 + Huffman + 2nd order context modeling
ZIP, ///< ZIP format, using DEFLATE algorithm
XZ ///< XZ format, using LZMA(2) algorithm
XZ, ///< XZ format, using LZMA(2) algorithm
ZLIB, ///< ZLIB format, using DEFLATE algorithm
LZ4, ///< LZ4 format, using LZ77
LZO, ///< Lempel–Ziv–Oberhumer format
ZSTD ///< Zstandard format
};

/**
Expand Down
41 changes: 13 additions & 28 deletions cpp/src/io/comp/io_uncomp.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2021, NVIDIA CORPORATION.
* Copyright (c) 2018-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,35 +27,20 @@ using cudf::host_span;

namespace cudf {
namespace io {
enum {
IO_UNCOMP_STREAM_TYPE_INFER = 0,
IO_UNCOMP_STREAM_TYPE_GZIP = 1,
IO_UNCOMP_STREAM_TYPE_ZIP = 2,
IO_UNCOMP_STREAM_TYPE_BZIP2 = 3,
IO_UNCOMP_STREAM_TYPE_XZ = 4,
IO_UNCOMP_STREAM_TYPE_INFLATE = 5,
IO_UNCOMP_STREAM_TYPE_SNAPPY = 6,
IO_UNCOMP_STREAM_TYPE_BROTLI = 7,
IO_UNCOMP_STREAM_TYPE_LZ4 = 8,
IO_UNCOMP_STREAM_TYPE_LZO = 9,
IO_UNCOMP_STREAM_TYPE_ZSTD = 10,
};

std::vector<char> io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type);

std::vector<char> get_uncompressed_data(host_span<char const> data, compression_type compression);

class HostDecompressor {
public:
virtual size_t Decompress(uint8_t* dstBytes,
size_t dstLen,
uint8_t const* srcBytes,
size_t srcLen) = 0;
virtual ~HostDecompressor() {}
/**
* @brief Decompresses a system memory buffer.
*
* The output vector is allocated by this function and returned by value.
*
* @param compression Type of compression of the input data
* @param src Compressed host buffer
*
* @return Vector containing the decompressed output
*
* NOTE(review): failure reporting is not visible from this declaration - confirm whether
* unsupported compression types / malformed input are reported by throwing.
*/
std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t const> src);

public:
static std::unique_ptr<HostDecompressor> Create(int stream_type);
};
/**
* @brief Decompresses a system memory buffer into a caller-provided host buffer.
*
* @param compression Type of compression of the input data
* @param src Compressed host buffer (read-only)
* @param dst Writable host buffer for the output
*
* @return NOTE(review): presumably the number of bytes written to `dst` - confirm against the
* definition, along with the behavior when `dst` is too small for the output.
*/
size_t decompress(compression_type compression,
host_span<uint8_t const> src,
host_span<uint8_t> dst);

/**
* @brief GZIP header flags
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/io/comp/nvcomp_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,37 +23,37 @@
namespace cudf::io::nvcomp {

template <typename... Args>
auto batched_decompress_get_temp_size(compression_type type, Args&&... args)
auto batched_decompress_get_temp_size(compression_type compression, Args&&... args)
{
switch (type) {
switch (compression) {
case compression_type::SNAPPY:
return nvcompBatchedSnappyDecompressGetTempSize(std::forward<Args>(args)...);
default: CUDF_FAIL("Unsupported compression type");
}
};

template <typename... Args>
auto batched_decompress_async(compression_type type, Args&&... args)
auto batched_decompress_async(compression_type compression, Args&&... args)
{
switch (type) {
switch (compression) {
case compression_type::SNAPPY:
return nvcompBatchedSnappyDecompressAsync(std::forward<Args>(args)...);
default: CUDF_FAIL("Unsupported compression type");
}
};

size_t get_temp_size(compression_type type, size_t num_chunks, size_t max_uncomp_chunk_size)
size_t get_temp_size(compression_type compression, size_t num_chunks, size_t max_uncomp_chunk_size)
{
size_t temp_size = 0;
nvcompStatus_t nvcomp_status =
batched_decompress_get_temp_size(type, num_chunks, max_uncomp_chunk_size, &temp_size);
batched_decompress_get_temp_size(compression, num_chunks, max_uncomp_chunk_size, &temp_size);
CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
"Unable to get scratch size for decompression");

return temp_size;
}

void batched_decompress(compression_type type,
void batched_decompress(compression_type compression,
device_span<device_span<uint8_t const> const> inputs,
device_span<device_span<uint8_t> const> outputs,
device_span<decompress_status> statuses,
Expand All @@ -67,8 +67,8 @@ void batched_decompress(compression_type type,
rmm::device_uvector<size_t> actual_uncompressed_data_sizes(num_chunks, stream);
rmm::device_uvector<nvcompStatus_t> nvcomp_statuses(num_chunks, stream);
// Temporary space required for decompression
rmm::device_buffer scratch(get_temp_size(type, num_chunks, max_uncomp_chunk_size), stream);
auto const nvcomp_status = batched_decompress_async(type,
rmm::device_buffer scratch(get_temp_size(compression, num_chunks, max_uncomp_chunk_size), stream);
auto const nvcomp_status = batched_decompress_async(compression,
nvcomp_args.compressed_data_ptrs.data(),
nvcomp_args.compressed_data_sizes.data(),
nvcomp_args.uncompressed_data_sizes.data(),
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/comp/nvcomp_adapter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ enum class compression_type { SNAPPY };
* @param[in] max_uncomp_page_size maximum size of uncompressed block
* @param[in] stream CUDA stream to use
*/
void batched_decompress(compression_type type,
void batched_decompress(compression_type compression,
device_span<device_span<uint8_t const> const> inputs,
device_span<device_span<uint8_t> const> outputs,
device_span<decompress_status> statuses,
Expand Down
Loading