Reduce peak memory use when writing compressed ORC files. #12963

Merged: 38 commits from vuule:reduce-orc-writer-mem into branch-23.06 on Apr 3, 2023

Commits (38)
f05525e  top iter stripes, bottom row groups (vuule, Feb 1, 2023)
66feb06  separate sizes from offsets calc (vuule, Feb 1, 2023)
5e057c7  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 1, 2023)
a5a1ba0  per-stream enc data buffer (vuule, Mar 1, 2023)
71f4c4f  per-stripe buffer (vuule, Mar 1, 2023)
fc3fc2a  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 6, 2023)
beea72a  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 8, 2023)
7694c1d  merge (vuule, Mar 9, 2023)
5bee01e  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 15, 2023)
866f827  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 15, 2023)
8ed0857  POC (vuule, Mar 15, 2023)
d4c5075  optimization (vuule, Mar 17, 2023)
c120e08  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 17, 2023)
579fd42  todo comment (vuule, Mar 17, 2023)
94e18b7  style (vuule, Mar 17, 2023)
4a2f652  remove compute_offsets (vuule, Mar 17, 2023)
f6a8765  slight clean up (vuule, Mar 17, 2023)
bfcc351  minor kernel simplification (vuule, Mar 17, 2023)
bdfa0b6  bit o' cleanup (vuule, Mar 17, 2023)
a919cbb  TODO (vuule, Mar 17, 2023)
f358232  Merge branch 'branch-23.04' into reduce-orc-writer-mem (vuule, Mar 17, 2023)
b383052  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 21, 2023)
8bdad8b  still merge (vuule, Mar 21, 2023)
4c02381  Merge branch 'reduce-orc-writer-mem' of https://github.com/vuule/cudf… (vuule, Mar 21, 2023)
a57f7bb  Merge branch 'branch-23.04' into reduce-orc-writer-mem (vuule, Mar 21, 2023)
8d2cc43  style (vuule, Mar 22, 2023)
5097a23  Merge branch 'reduce-orc-writer-mem' of https://github.com/vuule/cudf… (vuule, Mar 22, 2023)
8372172  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 24, 2023)
b48fbf7  Merge branch 'branch-23.04' into reduce-orc-writer-mem (vuule, Mar 24, 2023)
43aa1f7  add comment (vuule, Mar 24, 2023)
25af66e  Merge branch 'branch-23.04' of https://github.com/rapidsai/cudf into … (vuule, Mar 28, 2023)
60a23ed  doc update (vuule, Mar 28, 2023)
4e30777  pass bool comment (vuule, Mar 29, 2023)
9efde71  remove magic number (vuule, Mar 29, 2023)
12655fc  Merge branch 'reduce-orc-writer-mem' of https://github.com/vuule/cudf… (vuule, Mar 29, 2023)
0522524  style (vuule, Mar 29, 2023)
ba4c078  Merge branch 'branch-23.06' into reduce-orc-writer-mem (vuule, Mar 30, 2023)
313131c  Merge branch 'branch-23.06' into reduce-orc-writer-mem (vuule, Apr 3, 2023)
cpp/src/io/orc/orc_gpu.hpp (1 change: 1 addition & 0 deletions)

@@ -159,6 +159,7 @@ struct encoder_chunk_streams {
  * @brief Struct to describe a column stream within a stripe
  */
 struct StripeStream {
+  uint8_t* data_ptr;        // encoded and gathered output
   size_t bfr_offset;        // Offset of this stream in compressed buffer
   uint32_t stream_size;     // Size of stream in bytes
   uint32_t first_chunk_id;  // First chunk of the stripe
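For orientation, here is a minimal host-side sketch of how the new data_ptr field is meant to be used, pieced together from the commit messages ("per-stripe buffer", "separate sizes from offsets calc") and the author's note further down that stream sizes are computed on the host. The struct and helper below are simplified, hypothetical stand-ins, not the PR's actual host code: each stripe gets one device buffer, and every stream of that stripe is assigned a slice of it before the gather kernel runs.

#include <cstddef>
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

// Hypothetical, simplified stand-in for the StripeStream descriptor above;
// only the fields needed for this illustration are kept.
struct stripe_stream_sketch {
  uint8_t* data_ptr;     // where the gather kernel writes this stream
  uint32_t stream_size;  // total bytes of this stream within the stripe
};

// Give every stream of one stripe a slice of a single per-stripe device buffer.
// Sizes are already known on the host, so only pointer arithmetic happens here.
// Returns the buffer so the caller can free it once the stripe has been written.
uint8_t* assign_stripe_buffer(std::vector<stripe_stream_sketch>& stripe_streams)
{
  std::size_t total = 0;
  for (auto const& s : stripe_streams) {
    total += s.stream_size;
  }

  uint8_t* stripe_buffer = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&stripe_buffer), total);  // one allocation per stripe

  std::size_t offset = 0;
  for (auto& s : stripe_streams) {
    s.data_ptr = stripe_buffer + offset;  // precomputed destination for the gather kernel
    offset += s.stream_size;
  }
  return stripe_buffer;
}

With the destinations precomputed like this, the device code only needs to read data_ptr per stream; it never has to derive offsets on the GPU.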
cpp/src/io/orc/stripe_enc.cu (55 changes: 21 additions & 34 deletions)

@@ -43,7 +43,8 @@ namespace gpu {
 
 using cudf::detail::device_2dspan;
 
-constexpr int scratch_buffer_size = 512 * 4;
+constexpr int scratch_buffer_size         = 512 * 4;
+constexpr int compact_streams_block_size  = 1024;
 
 // Apache ORC reader does not handle zero-length patch lists for RLEv2 mode2
 // Workaround replaces zero-length patch lists by a dummy zero patch

@@ -1082,51 +1083,37 @@ __global__ void __launch_bounds__(block_size)
  * @param[in,out] strm_desc StripeStream device array [stripe][stream]
  * @param[in,out] streams List of encoder chunk streams [column][rowgroup]
  */
-// blockDim {1024,1,1}
-__global__ void __launch_bounds__(1024)
+// blockDim {compact_streams_block_size,1,1}
+__global__ void __launch_bounds__(compact_streams_block_size)
   gpuCompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
                            device_2dspan<encoder_chunk_streams> streams)
 {
   __shared__ __align__(16) StripeStream ss;
-  __shared__ __align__(16) encoder_chunk_streams strm0;
-  __shared__ uint8_t* volatile ck_curptr_g;
-  __shared__ uint32_t volatile ck_curlen_g;
 
   auto const stripe_id = blockIdx.x;
   auto const stream_id = blockIdx.y;
-  uint32_t t = threadIdx.x;
+  auto const t = threadIdx.x;
 
-  if (t == 0) {
-    ss = strm_desc[stripe_id][stream_id];
-    strm0 = streams[ss.column_id][ss.first_chunk_id];
-  }
+  if (t == 0) { ss = strm_desc[stripe_id][stream_id]; }
   __syncthreads();
+
+  if (ss.data_ptr == nullptr) { return; }
+
   auto const cid = ss.stream_type;
-  auto dst_ptr = strm0.data_ptrs[cid] + strm0.lengths[cid];
-  for (auto group = ss.first_chunk_id + 1; group < ss.first_chunk_id + ss.num_chunks; ++group) {
-    uint8_t* src_ptr;
-    uint32_t len;
-    if (t == 0) {
-      src_ptr = streams[ss.column_id][group].data_ptrs[cid];
-      len = streams[ss.column_id][group].lengths[cid];
-      if (src_ptr != dst_ptr) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; }
-      ck_curptr_g = src_ptr;
-      ck_curlen_g = len;
-    }
-    __syncthreads();
-    src_ptr = ck_curptr_g;
-    len = ck_curlen_g;
-    if (len > 0 && src_ptr != dst_ptr) {
-      for (uint32_t i = 0; i < len; i += 1024) {
-        uint8_t v = (i + t < len) ? src_ptr[i + t] : 0;
-        __syncthreads();
-        if (i + t < len) { dst_ptr[i + t] = v; }
+  auto dst_ptr = ss.data_ptr;
+  for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) {
+    auto const len = streams[ss.column_id][group].lengths[cid];
+    if (len > 0) {
+      auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid];
+      for (uint32_t i = t; i < len; i += blockDim.x) {
+        dst_ptr[i] = src_ptr[i];
       }
+
+      __syncthreads();
+      if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; }
+      dst_ptr += len;
     }
-    dst_ptr += len;
-    __syncthreads();
   }
-  if (!t) { strm_desc[stripe_id][stream_id].stream_size = dst_ptr - strm0.data_ptrs[cid]; }

vuule (author), on the removed line above: no need to set the stream size here, it's been computed on the host.

 }
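The rewritten kernel is essentially a block-strided gather: one thread block per (stripe, stream) pair walks that stream's rowgroup segments and copies each one to the next free position in the stripe-level buffer. A stripped-down, self-contained sketch of the same pattern follows; the segment_sketch type and gather_segments name are illustrative stand-ins, not cudf code.

#include <cstdint>

// Hypothetical segment descriptor: one rowgroup's piece of an encoded stream.
struct segment_sketch {
  uint8_t const* src;
  uint32_t len;
};

// One thread block gathers all segments of one stream back to back into dst.
// Each segment is copied with a block-strided loop, so accesses stay coalesced
// and no shared-memory staging of pointers or lengths is needed.
__global__ void gather_segments(segment_sketch const* segments,
                                uint32_t num_segments,
                                uint8_t* dst)
{
  auto const t = threadIdx.x;
  for (uint32_t s = 0; s < num_segments; ++s) {
    auto const seg = segments[s];
    for (uint32_t i = t; i < seg.len; i += blockDim.x) {
      dst[i] = seg.src[i];
    }
    __syncthreads();  // keep the block in step before advancing to the next segment
    dst += seg.len;   // same local pointer arithmetic in every thread
  }
}

Launched with one block per output stream (the PR uses a 2D grid of stripes by streams), this mirrors what the rewritten kernel does. The removed code instead compacted the rowgroup pieces in place, directly behind the first chunk's data; that is why it staged the source pointer and length through shared memory and synchronized between the read and the write of every 1024-byte slice. Gathering into a separate per-stripe buffer removes both complications.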
@@ -1299,7 +1286,7 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
                            device_2dspan<encoder_chunk_streams> enc_streams,
                            rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(1024, 1);
+  dim3 dim_block(compact_streams_block_size, 1);
   dim3 dim_grid(strm_desc.size().first, strm_desc.size().second);
   gpuCompactOrcDataStreams<<<dim_grid, dim_block, 0, stream.value()>>>(strm_desc, enc_streams);
 }
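The launch itself is unchanged apart from the block size now coming from the compact_streams_block_size constant defined at the top of the file (the "remove magic number" commit), so __launch_bounds__ and the launch configuration share a single definition. A minimal sketch of that pattern, with placeholder names rather than the cudf ones:

#include <cuda_runtime.h>

// One named constant feeds both __launch_bounds__ and the launch, so the two
// cannot silently drift apart. kBlockSize is a placeholder, not a cudf name.
constexpr int kBlockSize = 1024;

__global__ void __launch_bounds__(kBlockSize) fill_thread_ids(int* out)
{
  out[blockIdx.x * kBlockSize + threadIdx.x] = threadIdx.x;
}

void launch_fill(int* d_out, int num_blocks, cudaStream_t stream)
{
  fill_thread_ids<<<num_blocks, kBlockSize, 0, stream>>>(d_out);
}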