From 260269c3928297165550d102c927f42ac5f81776 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:43:40 -0500 Subject: [PATCH 1/5] rename workflow repository (#14300) --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 28 ++++++++++++++-------------- .github/workflows/test.yaml | 16 ++++++++-------- ci/release/update-version.sh | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0e120d34bb1..666d8844a80 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 054ea7968c8..cb9d67639ef 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 030f2e41db4..1ba0a7491a8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 5e735a71994..eac64fe1a0f 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -113,7 +113,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_ # CI files for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh From f7ad66f440168fd4eceb3cc900301661023e42a1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 20 Oct 2023 09:31:16 -0700 Subject: [PATCH 2/5] Add DELTA_BINARY_PACKED encoder for Parquet writer (#14100) Part of #13501. Adds ability to fall back on DELTA_BINARY_PACKED encoding when V2 page headers are selected and dictionary encoding is not possible. Authors: - Ed Seidl (https://github.com/etseidl) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14100 --- cpp/src/io/parquet/delta_binary.cuh | 6 - cpp/src/io/parquet/delta_enc.cuh | 290 ++++++++ cpp/src/io/parquet/page_enc.cu | 998 ++++++++++++++++++++-------- cpp/src/io/parquet/parquet_gpu.hpp | 51 +- cpp/tests/io/parquet_test.cpp | 108 ++- 5 files changed, 1156 insertions(+), 297 deletions(-) create mode 100644 cpp/src/io/parquet/delta_enc.cuh diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index a513e6674b4..e3b23f4c0a0 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -46,12 +46,6 @@ namespace cudf::io::parquet::detail { // encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix // lengths, followed by the concatenated suffix data. -// TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of using -// __int128_t rather than int64_t. -using uleb128_t = uint64_t; -using zigzag128_t = int64_t; - // we decode one mini-block at a time. max mini-block size seen is 64. constexpr int delta_rolling_buf_size = 128; diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh new file mode 100644 index 00000000000..28f8cdfe2c1 --- /dev/null +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::detail { + +namespace delta { + +inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v) +{ + while (v > 0x7f) { + *(p++) = v | 0x80; + v >>= 7; + } + *(p++) = v; +} + +inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) +{ + zigzag128_t s = (v < 0); + put_uleb128(p, (v ^ -s) * 2 + s); +} + +// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming +// too much shared memory. +// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block +// to be a multiple of 32. +constexpr int block_size = 128; +constexpr int num_mini_blocks = 4; +constexpr int values_per_mini_block = block_size / num_mini_blocks; +constexpr int buffer_size = 2 * block_size; + +// An extra sanity checks to enforce compliance with the parquet specification. +static_assert(block_size % 128 == 0); +static_assert(values_per_mini_block % 32 == 0); + +using block_reduce = cub::BlockReduce; +using warp_reduce = cub::WarpReduce; +using index_scan = cub::BlockScan; + +constexpr int rolling_idx(int index) { return rolling_index(index); } + +// Version of bit packer that can handle up to 64 bits values. +// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long +// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a +// measurable impact to using the wider types. +template +inline __device__ void bitpack_mini_block( + uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) +{ + using wide_type = + std::conditional_t, __uint128_t, uint64_t>; + using cudf::detail::warp_size; + scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; + auto constexpr div = sizeof(scratch_type) * 8; + + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + + auto const scratch = reinterpret_cast(temp_space) + warp_id * warp_size; + + // zero out scratch + scratch[lane_id] = 0; + __syncwarp(); + + // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...) + // like what's done for the RLE encoder. + if (nbits == div) { + if (lane_id < count) { + for (int i = 0; i < sizeof(scratch_type); i++) { + dst[lane_id * sizeof(scratch_type) + i] = val & 0xff; + val >>= 8; + } + } + return; + } + + if (lane_id <= count) { + // Shift symbol left by up to mask bits. + wide_type v2 = val; + v2 <<= (lane_id * nbits) & mask; + + // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules. + scratch_type v1[2]; + memcpy(&v1, &v2, sizeof(wide_type)); + + // Atomically write result to scratch. + if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); } + if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); } + } + __syncwarp(); + + // Copy scratch data to final destination. + auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U); + auto const scratch_bytes = reinterpret_cast(scratch); + + for (uint32_t i = lane_id; i < available_bytes; i += warp_size) { + dst[i] = scratch_bytes[i]; + } + __syncwarp(); +} + +} // namespace delta + +// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input +// 128 values with validity at a time, saving them until there are enough values for a block +// to be written. +// T is the input data type (either zigzag128_t or uleb128_t). +template +class delta_binary_packer { + private: + uint8_t* _dst; // sink to dump encoded values to + T* _buffer; // buffer to store values to be encoded + size_type _current_idx; // index of first value in buffer + uint32_t _num_values; // total number of values to encode + size_type _values_in_buffer; // current number of values stored in _buffer + uint8_t _mb_bits[delta::num_mini_blocks]; // bitwidth for each mini-block + + // pointers to shared scratch memory for the warp and block scans/reduces + delta::index_scan::TempStorage* _scan_tmp; + delta::warp_reduce::TempStorage* _warp_tmp; + delta::block_reduce::TempStorage* _block_tmp; + + void* _bitpack_tmp; // pointer to shared scratch memory used in bitpacking + + // Write the delta binary header. Only call from thread 0. + inline __device__ void write_header() + { + delta::put_uleb128(_dst, delta::block_size); + delta::put_uleb128(_dst, delta::num_mini_blocks); + delta::put_uleb128(_dst, _num_values); + delta::put_zz128(_dst, _buffer[0]); + } + + // Write the block header. Only call from thread 0. + inline __device__ void write_block_header(zigzag128_t block_min) + { + delta::put_zz128(_dst, block_min); + memcpy(_dst, _mb_bits, 4); + _dst += 4; + } + + // Signed subtraction with defined wrapping behavior. + inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b) + { + return static_cast(static_cast(a) - static_cast(b)); + } + + public: + inline __device__ auto num_values() const { return _num_values; } + + // Initialize the object. Only call from thread 0. + inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage) + { + _dst = dest; + _num_values = num_values; + _buffer = buffer; + _scan_tmp = reinterpret_cast(temp_storage); + _warp_tmp = reinterpret_cast(temp_storage); + _block_tmp = reinterpret_cast(temp_storage); + _bitpack_tmp = _buffer + delta::buffer_size; + _current_idx = 0; + _values_in_buffer = 0; + } + + // Each thread calls this to add its current value. + inline __device__ void add_value(T value, bool is_valid) + { + // Figure out the correct position for the given value. + size_type const valid = is_valid; + size_type pos; + size_type num_valid; + delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid); + + if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } + __syncthreads(); + + if (threadIdx.x == 0) { + _values_in_buffer += num_valid; + // if first pass write header + if (_current_idx == 0) { + write_header(); + _current_idx = 1; + _values_in_buffer -= 1; + } + } + __syncthreads(); + + if (_values_in_buffer >= delta::block_size) { flush(); } + } + + // Called by each thread to flush data to the sink. + inline __device__ uint8_t const* flush() + { + using cudf::detail::warp_size; + __shared__ zigzag128_t block_min; + + int const t = threadIdx.x; + int const warp_id = t / warp_size; + int const lane_id = t % warp_size; + + if (_values_in_buffer <= 0) { return _dst; } + + // Calculate delta for this thread. + size_type const idx = _current_idx + t; + zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], + _buffer[delta::rolling_idx(idx - 1)]) + : std::numeric_limits::max(); + + // Find min delta for the block. + auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + + if (t == 0) { block_min = min_delta; } + __syncthreads(); + + // Compute frame of reference for the block. + uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0; + + // Get max normalized delta for each warp, and use that to determine how many bits to use + // for the bitpacking of this warp. + zigzag128_t const warp_max = + delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + __syncwarp(); + + if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); } + __syncthreads(); + + // write block header + if (t == 0) { write_block_header(block_min); } + __syncthreads(); + + // Now each warp encodes its data...can calculate starting offset with _mb_bits. + // NOTE: using a switch here rather than a loop because the compiler produces code that + // uses fewer registers. + int cumulative_bits = 0; + switch (warp_id) { + case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]]; + case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]]; + case 1: cumulative_bits += _mb_bits[0]; + } + uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8; + + // encoding happens here + auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block; + if (warp_idx < _num_values) { + auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx); + if (_mb_bits[warp_id] > 32) { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } else { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } + } + __syncthreads(); + + // Last warp updates global delta ptr. + if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) { + _dst = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8; + _current_idx = min(warp_idx + delta::values_per_mini_block, _num_values); + _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U); + } + __syncthreads(); + + return _dst; + } +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 78873d5e8ca..1e4f061d2e0 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "delta_enc.cuh" #include "parquet_gpu.cuh" #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -41,13 +43,19 @@ #include #include +#include + namespace cudf::io::parquet::detail { namespace { using ::cudf::detail::device_2dspan; -constexpr uint32_t rle_buffer_size = (1 << 9); +constexpr int encode_block_size = 128; +constexpr int rle_buffer_size = 2 * encode_block_size; +constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; + +constexpr int rolling_idx(int pos) { return rolling_index(pos); } // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -69,6 +77,7 @@ struct frag_init_state_s { PageFragment frag; }; +template struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr @@ -81,14 +90,15 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[4]; - volatile uint32_t scratch_red[32]; + volatile uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint32_t vals[rle_buffer_size]; + uint32_t vals[rle_buf_size]; }; +using rle_page_enc_state_s = page_enc_state_s; + /** * @brief Returns the size of the type in the Parquet file. */ @@ -205,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +/** + * @brief Determine the correct page encoding for the given page parameters. + * + * This is only used by the plain and dictionary encoders. Delta encoders will set the page + * encoding directly. + */ Encoding __device__ determine_encoding(PageType page_type, Type physical_type, bool use_dictionary, @@ -216,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type, switch (page_type) { case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; case PageType::DATA_PAGE_V2: - // TODO need to work in delta encodings here when they're added return physical_type == BOOLEAN ? Encoding::RLE : use_dictionary ? Encoding::RLE_DICTIONARY : Encoding::PLAIN; @@ -236,6 +251,50 @@ struct BitwiseOr { } }; +// I is the column type from the input table +template +__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, + uint32_t valid_count, + uint64_t* buffer, + void* temp_space) +{ + using output_type = std::conditional_t, zigzag128_t, uleb128_t>; + __shared__ delta_binary_packer packer; + + auto const t = threadIdx.x; + if (t == 0) { + packer.init(s->cur, valid_count, reinterpret_cast(buffer), temp_space); + } + __syncthreads(); + + // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64. + // might need to modify this if there's a big performance hit in the 32-bit case. + int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size); + + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx = s->page_start_val + val_idx_in_block; + + bool const is_valid = + (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx) + : false; + + cur_val_idx += nvals; + + output_type v = s->col.leaf_column->element(val_idx); + if (scale < 0) { + v /= -scale; + } else { + v *= scale; + } + packer.add_value(v, is_valid); + } + + return packer.flush(); +} + } // anonymous namespace // blockDim {512,1,1} @@ -323,6 +382,29 @@ __global__ void __launch_bounds__(128) } } +__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values) +{ + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + auto const vals_per_block = delta::block_size; + size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); + // need max dtype_len + 1 bytes for min_delta + // one byte per mini block for the bitwidth + // and block_size * dtype_len bytes for the actual encoded data + auto const block_size = dtype_len + 1 + delta::num_mini_blocks + vals_per_block * dtype_len; + + // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, + // max 5 bytes for number of values, and max dtype_len + 1 for first value. + auto const header_size = 2 + 1 + 5 + dtype_len + 1; + + return header_size + num_blocks * block_size; +} + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, @@ -354,6 +436,14 @@ __global__ void __launch_bounds__(128) page_g = {}; } __syncthreads(); + + // if writing delta encoded values, we're going to need to know the data length to get a guess + // at the worst case number of bytes needed to encode. + auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_use_delta = + write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64); + if (t < 32) { uint32_t fragments_in_chunk = 0; uint32_t rows_in_page = 0; @@ -403,9 +493,12 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) pages[ck_g.first_page] = page_g; - if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size; - if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g; + if (not pages.empty()) { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + pages[ck_g.first_page] = page_g; + } + if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; } + if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; } } num_pages = 1; } @@ -505,7 +598,12 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + // get a different bound if using delta encoding + if (is_use_delta) { + page_size = + max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values)); + } + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -525,7 +623,16 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; } + if (not pages.empty()) { + if (is_use_delta) { + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + } else if (ck_g.use_dictionary || physical_type == BOOLEAN) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } + pages[ck_g.first_page + num_pages] = page_g; + } if (not page_sizes.empty()) { page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; } @@ -789,8 +896,12 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { + using cudf::detail::warp_size; + auto const lane_id = t % warp_size; + auto const warp_id = t / warp_size; + uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -798,20 +909,20 @@ static __device__ void RleEncode( uint32_t pos = rle_pos + t; if (rle_run > 0 && !(rle_run & 1)) { // Currently in a long repeat run - uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val); + uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val); uint32_t rle_rpt_count, max_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu); - if (!t) { + if (t == 0) { uint32_t last_idx = __ffs(c32) - 1; s->rle_rpt_count = - last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); + last_idx * warp_size + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); } } __syncthreads(); - max_rpt_count = min(numvals - rle_pos, 128); + max_rpt_count = min(numvals - rle_pos, encode_block_size); rle_rpt_count = s->rle_rpt_count; rle_run += rle_rpt_count << 1; rle_pos += rle_rpt_count; @@ -828,17 +939,17 @@ static __device__ void RleEncode( } } else { // New run or in a literal run - uint32_t v0 = s->vals[pos & (rle_buffer_size - 1)]; - uint32_t v1 = s->vals[(pos + 1) & (rle_buffer_size - 1)]; + uint32_t v0 = s->vals[rolling_idx(pos)]; + uint32_t v1 = s->vals[rolling_idx(pos + 1)]; uint32_t mask = ballot(pos + 1 < numvals && v0 == v1); - uint32_t maxvals = min(numvals - rle_pos, 128); + uint32_t maxvals = min(numvals - rle_pos, encode_block_size); uint32_t rle_lit_count, rle_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { // Repeat run can only start on a multiple of 8 values - uint32_t idx8 = (t * 8) >> 5; - uint32_t pos8 = (t * 8) & 0x1f; + uint32_t idx8 = (t * 8) / warp_size; + uint32_t pos8 = (t * 8) % warp_size; uint32_t m0 = (idx8 < 4) ? s->rpt_map[idx8] : 0; uint32_t m1 = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0; uint32_t needed_mask = kRleRunMask[nbits - 1]; @@ -847,8 +958,8 @@ static __device__ void RleEncode( uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals; uint32_t rpt_len = 0; if (rle_run_start < maxvals) { - uint32_t idx_cur = rle_run_start >> 5; - uint32_t idx_ofs = rle_run_start & 0x1f; + uint32_t idx_cur = rle_run_start / warp_size; + uint32_t idx_ofs = rle_run_start % warp_size; while (idx_cur < 4) { m0 = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0; m1 = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0; @@ -857,7 +968,7 @@ static __device__ void RleEncode( rpt_len += __ffs(mask) - 1; break; } - rpt_len += 32; + rpt_len += warp_size; idx_cur++; } } @@ -928,7 +1039,7 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s* s, +static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) @@ -938,7 +1049,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, while (rle_pos < numvals) { uint32_t pos = rle_pos + t; - uint32_t v = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0; + uint32_t v = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0; uint32_t n = min(numvals - rle_pos, 128); uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3; if (!nbytes) { break; } @@ -992,28 +1103,22 @@ __device__ auto julian_days_with_time(int64_t v) return std::make_pair(dur_time_of_day_nanos, julian_days); } +// this has been split out into its own kernel because of the amount of shared memory required +// for the state buffer. encode kernels that don't use the RLE buffer can get started while +// the level data is encoded. // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, - device_span> comp_in, - device_span> comp_out, - device_span comp_results, - bool write_v2_headers) +__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { - __shared__ __align__(8) page_enc_state_s state_g; - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ __align__(8) rle_page_enc_state_s state_g; - page_enc_state_s* const s = &state_g; - auto const t = threadIdx.x; + auto* const s = &state_g; + uint32_t const t = threadIdx.x; if (t == 0) { - state_g = page_enc_state_s{}; + state_g = rle_page_enc_state_s{}; s->page = pages[blockIdx.x]; s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1026,6 +1131,8 @@ __global__ void __launch_bounds__(128, 8) } __syncthreads(); + if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; } + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; // Encode Repetition and Definition levels @@ -1078,23 +1185,24 @@ __global__ void __launch_bounds__(128, 8) } while (is_col_struct); return def; }(); - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; + s->vals[rolling_idx(rle_numvals + t)] = def_lvl; __syncthreads(); rle_numvals += nrows; RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; s->page.def_lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } } } else if (s->page.page_type != PageType::DICTIONARY_PAGE && @@ -1121,29 +1229,121 @@ __global__ void __launch_bounds__(128, 8) uint32_t idx = page_first_val_idx + rle_numvals + t; uint32_t lvl_val = (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0; - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val; + s->vals[rolling_idx(rle_numvals + t)] = lvl_val; __syncthreads(); rle_numvals += nvals; RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } }; encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } + + if (t == 0) { pages[blockIdx.x] = s->page; } +} + +template +__device__ void finish_page_encode(state_buf* s, + uint32_t valid_count, + uint8_t const* end_ptr, + device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + auto const t = threadIdx.x; + + // V2 does not compress rep and def level data + size_t const skip_comp_size = + write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; + + if (t == 0) { + // only need num_nulls for v2 data page headers + if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; } + uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(end_ptr - base); + if (actual_data_size > s->page.max_data_size) { + CUDF_UNREACHABLE("detected possible page data corruption"); + } + s->page.max_data_size = actual_data_size; + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + } + pages[blockIdx.x] = s->page; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + } + } + + // copy uncompressed bytes over + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } +} + +// PLAIN page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodePages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; } + // Encode data values __syncthreads(); auto const physical_type = s->col.physical_type; @@ -1155,10 +1355,6 @@ __global__ void __launch_bounds__(128, 8) return dtype_len_out; }(); - auto const dict_bits = (physical_type == BOOLEAN) ? 1 - : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) - ? s->ck.dict_rle_bits - : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1167,219 +1363,315 @@ __global__ void __launch_bounds__(128, 8) s->rle_out = dst; s->page.encoding = determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); - if (dict_bits >= 0 && physical_type != BOOLEAN) { - dst[0] = dict_bits; - s->rle_out = dst + 1; - } else if (is_v2 && physical_type == BOOLEAN) { - // save space for RLE length. we don't know the total length yet. - s->rle_out = dst + RLE_LENGTH_FIELD_LEN; - s->rle_len_pos = dst; - } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); uint32_t len, pos; auto [is_valid, val_idx] = [&]() { uint32_t val_idx; uint32_t is_valid; - size_type val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_block = cur_val_idx + t; if (s->page.page_type == PageType::DICTIONARY_PAGE) { val_idx = val_idx_in_block; is_valid = (val_idx < s->page.num_leaf_values); if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } } else { - size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) : 0; - val_idx = - (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + val_idx = val_idx_in_leaf_col; } return std::make_tuple(is_valid, val_idx); }(); - if (is_valid) num_valid++; - + if (is_valid) { num_valid++; } cur_val_idx += nvals; - if (dict_bits >= 0) { - // Dictionary encoding - if (dict_bits > 0) { - uint32_t rle_numvals; - uint32_t rle_numvals_in_block; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); - rle_numvals = s->rle_numvals; - if (is_valid) { - uint32_t v; - if (physical_type == BOOLEAN) { - v = s->col.leaf_column->element(val_idx); - } else { - v = s->ck.dict_index[val_idx]; - } - s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; - } - rle_numvals += rle_numvals_in_block; - __syncthreads(); - if (!is_v2 && physical_type == BOOLEAN) { - PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); - } else { - RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); + + // Non-dictionary encoding + uint8_t* dst = s->cur; + + if (is_valid) { + len = dtype_len_out; + if (physical_type == BYTE_ARRAY) { + if (type_id == type_id::STRING) { + len += s->col.leaf_column->element(val_idx).size_bytes(); + } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { + len += + get_element(*s->col.leaf_column, val_idx).size_bytes(); } - __syncthreads(); } - if (t == 0) { s->cur = s->rle_out; } - __syncthreads(); } else { - // Non-dictionary encoding - uint8_t* dst = s->cur; - - if (is_valid) { - len = dtype_len_out; - if (physical_type == BYTE_ARRAY) { - if (type_id == type_id::STRING) { - len += s->col.leaf_column->element(val_idx).size_bytes(); - } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { - len += - get_element(*s->col.leaf_column, val_idx).size_bytes(); + len = 0; + } + uint32_t total_len = 0; + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); + __syncthreads(); + if (t == 0) { s->cur = dst + total_len; } + if (is_valid) { + switch (physical_type) { + case INT32: [[fallthrough]]; + case FLOAT: { + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + } break; + case INT64: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } + } + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + } break; + case INT96: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } } - } - } else { - len = 0; - } - uint32_t total_len = 0; - block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); - __syncthreads(); - if (t == 0) { s->cur = dst + total_len; } - if (is_valid) { - switch (physical_type) { - case INT32: [[fallthrough]]; - case FLOAT: { - auto const v = [dtype_len = dtype_len_in, - idx = val_idx, - col = s->col.leaf_column, - scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { - switch (dtype_len) { - case 8: return col->element(idx) * scale; - case 4: return col->element(idx) * scale; - case 2: return col->element(idx) * scale; - default: return col->element(idx) * scale; - } - }(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - } break; - case INT64: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { + return julian_days_with_time(v); + } break; + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { + return julian_days_with_time(v); + } break; } - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - } break; - case INT96: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + return julian_days_with_time(0); + }(); + + // the 12 bytes of fixed length data. + v = last_day_nanos.count(); + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + uint32_t w = julian_days.count(); + dst[pos + 8] = w; + dst[pos + 9] = w >> 8; + dst[pos + 10] = w >> 16; + dst[pos + 11] = w >> 24; + } break; + + case DOUBLE: { + auto v = s->col.leaf_column->element(val_idx); + memcpy(dst + pos, &v, 8); + } break; + case BYTE_ARRAY: { + auto const bytes = [](cudf::type_id const type_id, + column_device_view const* leaf_column, + uint32_t const val_idx) -> void const* { + switch (type_id) { + case type_id::STRING: + return reinterpret_cast( + leaf_column->element(val_idx).data()); + case type_id::LIST: + return reinterpret_cast( + get_element(*(leaf_column), val_idx).data()); + default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); } + }(type_id, s->col.leaf_column, val_idx); + uint32_t v = len - 4; // string length + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + if (v != 0) memcpy(dst + pos + 4, bytes, v); + } break; + case FIXED_LEN_BYTE_ARRAY: { + if (type_id == type_id::DECIMAL128) { + // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian + auto const v = s->col.leaf_column->element(val_idx).value(); + auto const v_char_ptr = reinterpret_cast(&v); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + dst + pos); + } + } break; + } + } + __syncthreads(); + } - auto const [last_day_nanos, julian_days] = [&] { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { - case type_id::TIMESTAMP_SECONDS: - case type_id::TIMESTAMP_MILLISECONDS: { - return julian_days_with_time(v); - } break; - case type_id::TIMESTAMP_MICROSECONDS: - case type_id::TIMESTAMP_NANOSECONDS: { - return julian_days_with_time(v); - } break; - } - return julian_days_with_time(0); - }(); - - // the 12 bytes of fixed length data. - v = last_day_nanos.count(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - uint32_t w = julian_days.count(); - dst[pos + 8] = w; - dst[pos + 9] = w >> 8; - dst[pos + 10] = w >> 16; - dst[pos + 11] = w >> 24; - } break; + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); - case DOUBLE: { - auto v = s->col.leaf_column->element(val_idx); - memcpy(dst + pos, &v, 8); - } break; - case BYTE_ARRAY: { - auto const bytes = [](cudf::type_id const type_id, - column_device_view const* leaf_column, - uint32_t const val_idx) -> void const* { - switch (type_id) { - case type_id::STRING: - return reinterpret_cast( - leaf_column->element(val_idx).data()); - case type_id::LIST: - return reinterpret_cast( - get_element(*(leaf_column), val_idx).data()); - default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); - } - }(type_id, s->col.leaf_column, val_idx); - uint32_t v = len - 4; // string length - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - if (v != 0) memcpy(dst + pos + 4, bytes, v); - } break; - case FIXED_LEN_BYTE_ARRAY: { - if (type_id == type_id::DECIMAL128) { - // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian - auto const v = s->col.leaf_column->element(val_idx).value(); - auto const v_char_ptr = reinterpret_cast(&v); - thrust::copy(thrust::seq, - thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), - thrust::make_reverse_iterator(v_char_ptr), - dst + pos); - } - } break; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DICTIONARY page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDictPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) rle_page_enc_state_s state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = rle_page_enc_state_s{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + // TODO assert dict_bits >= 0 + auto const dict_bits = (physical_type == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); + if (dict_bits >= 0 && physical_type != BOOLEAN) { + dst[0] = dict_bits; + s->rle_out = dst + 1; + } else if (write_v2_headers && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. + s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; + } + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + + auto [is_valid, val_idx] = [&]() { + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + // need to test for use_dictionary because it might be boolean + uint32_t const val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + return std::make_tuple(is_valid, val_idx); + }(); + + if (is_valid) { num_valid++; } + cur_val_idx += nvals; + + // Dictionary encoding + if (dict_bits > 0) { + uint32_t rle_numvals; + uint32_t rle_numvals_in_block; + uint32_t pos; + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + rle_numvals = s->rle_numvals; + if (is_valid) { + uint32_t v; + if (physical_type == BOOLEAN) { + v = s->col.leaf_column->element(val_idx); + } else { + v = s->ck.dict_index[val_idx]; } + s->vals[rolling_idx(rle_numvals + pos)] = v; + } + rle_numvals += rle_numvals_in_block; + __syncthreads(); + if ((!write_v2_headers) && (physical_type == BOOLEAN)) { + PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); + } else { + RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); } __syncthreads(); } + if (t == 0) { s->cur = s->rle_out; } + __syncthreads(); } uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); @@ -1392,37 +1684,137 @@ __global__ void __launch_bounds__(128, 8) __syncwarp(); } - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DELTA_BINARY_PACKED page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDeltaBinaryPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results) +{ + // block of shared memory for value storage and bit packing + __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size]; + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename delta::index_scan::TempStorage delta_index_tmp; + typename delta::block_reduce::TempStorage delta_reduce_tmp; + typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks]; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* const base = s->page.page_data + s->page.max_hdr_size; - auto const actual_data_size = static_cast(s->cur - base); - if (actual_data_size > s->page.max_data_size) { - CUDF_UNREACHABLE("detected possible page data corruption"); - } - s->page.max_data_size = actual_data_size; - if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, - 0}; // size is unused - } - pages[blockIdx.x] = s->page; - if (not comp_results.empty()) { - comp_results[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = Encoding::DELTA_BINARY_PACKED; + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + // need to know the number of valid values for the null values calculation and to size + // the delta binary encoder. + uint32_t valid_count = 0; + if (not s->col.leaf_column->nullable()) { + valid_count = s->page.num_leaf_values; + } else { + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + if (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values && + s->col.leaf_column->is_valid(val_idx_in_leaf_col)) { + num_valid++; + } + cur_val_idx += nvals; } + valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); } - // copy over uncompressed data - if (skip_comp_size != 0 && not comp_in.empty()) { - uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; - uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { - dst[i] = src[i]; + uint8_t const* delta_ptr = nullptr; // this will be the end of delta block pointer + + if (physical_type == INT32) { + switch (dtype_len_in) { + case 8: { + // only DURATIONS map to 8 bytes, so safe to just use signed here? + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + break; + } + case 4: { + if (type_id == type_id::UINT32) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 2: { + if (type_id == type_id::UINT16) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 1: { + if (type_id == type_id::UINT8) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED"); + } + } else { + if (type_id == type_id::UINT64) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); } } + + finish_page_encode( + s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true); } constexpr int decide_compression_warps_in_block = 4; @@ -1457,7 +1849,8 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; - auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; + auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { compressed_data_size += comp_res->bytes_written + lvl_bytes; @@ -1920,7 +2313,8 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { - auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; + auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; @@ -2155,6 +2549,10 @@ constexpr __device__ void* align8(void* ptr) return static_cast(ptr) - algn; } +struct mask_tform { + __device__ uint32_t operator()(EncPage const& p) { return static_cast(p.kernel_mask); } +}; + } // namespace // blockDim(1, 1, 1) @@ -2257,8 +2655,9 @@ void InitFragmentStatistics(device_span groups, rmm::cuda_stream_view stream) { int const num_fragments = fragments.size(); - int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size); - gpuInitFragmentStats<<>>(groups, fragments); + int const dim = + util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size); + gpuInitFragmentStats<<>>(groups, fragments); } void InitEncoderPages(device_2dspan chunks, @@ -2277,18 +2676,18 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>(chunks, - pages, - page_sizes, - comp_page_sizes, - col_desc, - page_grstats, - chunk_grstats, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_align, - write_v2_headers); + gpuInitPages<<>>(chunks, + pages, + page_sizes, + comp_page_sizes, + col_desc, + page_grstats, + chunk_grstats, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_align, + write_v2_headers); } void EncodePages(device_span pages, @@ -2299,10 +2698,43 @@ void EncodePages(device_span pages, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); + + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.begin(), mask_tform{}); + uint32_t kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); + + // get the number of streams we need from the pool + int nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(stream, nkernels); + // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>( - pages, comp_in, comp_out, comp_results, write_v2_headers); + + int s_idx = 0; + if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::PLAIN); + gpuEncodePages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY); + gpuEncodeDeltaBinaryPages + <<>>(pages, comp_in, comp_out, comp_results); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DICTIONARY); + gpuEncodeDictPages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + + cudf::detail::join_streams(streams, stream); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2320,7 +2752,7 @@ void EncodePageHeaders(device_span pages, { // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the // threads to coop load structs - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, comp_results, page_stats, chunk_stats); } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 6a93fec0c46..048f1a73a9c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -88,6 +88,37 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; +// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +// this is in C++23 +#if !defined(__cpp_lib_is_scoped_enum) +template > +struct is_scoped_enum { + static const bool value = not std::is_convertible_v>; +}; + +template +struct is_scoped_enum { + static const bool value = false; +}; +#else +using std::is_scoped_enum; +#endif + +// helpers to do bit operations on scoped enums +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitAnd(T1 a, T2 b) +{ + return static_cast(a) & static_cast(b); +} + /** * @brief Enums for the flags in the page header */ @@ -371,6 +402,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) return 1 << static_cast(encoding); } +/** + * @brief Enum of mask bits for the EncPage kernel_mask + * + * Used to control which encode kernels to run. + */ +enum class encode_kernel_mask { + PLAIN = (1 << 0), // Run plain encoding kernel + DICTIONARY = (1 << 1), // Run dictionary encoding kernel + DELTA_BINARY = (1 << 2) // Run DELTA_BINARY_PACKED encoding kernel +}; + /** * @brief Struct describing an encoder column chunk */ @@ -429,10 +471,11 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) + compression_result* comp_res; //!< Ptr to compression result + uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run }; /** diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index fa85e3a4a1d..2a654bd7e8c 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -353,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { template struct ParquetReaderSourceTest : public ParquetReaderTest {}; +template +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; @@ -384,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding -// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, ParquetSizedTest, testing::Range(1, 25), @@ -6698,7 +6700,7 @@ TEST_P(ParquetV2Test, CheckEncodings) // data should be PLAIN for v1, RLE for V2 auto col0_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for both + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 auto col1_data = random_values(num_rows); // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); @@ -6733,10 +6735,10 @@ TEST_P(ParquetV2Test, CheckEncodings) // col0 should have RLE for rep/def and data EXPECT_TRUE(chunk0_enc.size() == 1); EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for rep/def and PLAIN for data + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data EXPECT_TRUE(chunk1_enc.size() == 2); EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data EXPECT_TRUE(chunk2_enc.size() == 3); EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); @@ -6758,6 +6760,104 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. +using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + TEST_F(ParquetWriterTest, EmptyMinStringStatistics) { char const* const min_val = ""; From 0341bb7cebfab1fb45d4a53cfc495265bb96ee3a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 20 Oct 2023 10:46:02 -0700 Subject: [PATCH 3/5] Expose streams in public null mask APIs (#14263) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14263 --- cpp/include/cudf/lists/detail/scatter.cuh | 6 +- cpp/include/cudf/null_mask.hpp | 24 +++++- cpp/src/binaryop/binaryop.cpp | 2 +- cpp/src/bitmask/null_mask.cu | 38 +++++++--- cpp/src/copying/concatenate.cu | 2 +- cpp/src/copying/scatter.cu | 5 +- cpp/src/groupby/hash/groupby.cu | 3 +- cpp/src/lists/contains.cu | 16 ++-- cpp/src/merge/merge.cu | 2 +- cpp/src/round/round.cu | 16 +++- cpp/src/search/contains_column.cu | 2 +- cpp/src/strings/replace/multi.cu | 2 +- cpp/src/strings/split/split_re.cu | 2 +- cpp/src/strings/split/split_record.cu | 6 +- cpp/src/unary/cast_ops.cu | 8 +- cpp/src/unary/math_ops.cu | 8 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/null_mask_test.cpp | 92 +++++++++++++++++++++++ 18 files changed, 191 insertions(+), 44 deletions(-) create mode 100644 cpp/tests/streams/null_mask_test.cpp diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index f04b2fda2bf..ff148c59a23 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -130,8 +130,8 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(child_column)); - auto null_mask = - target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; + auto null_mask = target.has_nulls() ? cudf::detail::copy_bitmask(target, stream, mr) + : rmm::device_buffer{0, stream, mr}; // The output column from this function only has null masks copied from the target columns. // That is still not a correct final null mask for the scatter result. diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 672f479ad53..524296e60ca 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null + * @param stream CUDA stream used for device memory operations and kernel launches */ -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range @@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( column_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -163,11 +177,13 @@ std::pair bitmask_and( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -183,8 +199,12 @@ std::pair bitmask_or( * @param bitmask Validity bitmask residing in device memory. * @param start Index of the first bit to count (inclusive). * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches * @return The number of null elements in the specified range. */ -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 6b413ab2be4..53b04c4ca80 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -366,7 +366,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 5a0d3e4f120..3ff56eabe1e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); + return detail::create_null_mask(size, state, stream, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream); } namespace detail { @@ -511,33 +516,46 @@ std::pair bitmask_or(table_view const& view, rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(view, stream, mr); } std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_and(view, stream, mr); } std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_or(view, stream, mr); } // Count non-zero bits in the specified range -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop) +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) { - return detail::null_count(bitmask, start, stop, cudf::get_default_stream()); + CUDF_FUNC_RANGE(); + return detail::null_count(bitmask, start, stop, stream); } } // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d08c3025553..9b9e780965a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); + cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), stream); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 11c27fc86e3..879ddb5048e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl { // Compute null mask rmm::device_buffer null_mask = - target.nullable() ? copy_bitmask(target, stream, mr) - : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + target.nullable() + ? detail::copy_bitmask(target, stream, mr) + : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); column null_mask_stub(data_type{type_id::STRUCT}, target.size(), rmm::device_buffer{}, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 506832881a9..195c8924c9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 4733a5d63a8..cd2bc493bc7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -274,12 +275,13 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { if (!search_key.is_valid(stream)) { - return make_numeric_column(data_type{cudf::type_to_id()}, - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + return make_numeric_column( + data_type{cudf::type_to_id()}, + lists.size(), + cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr), + lists.size(), + stream, + mr); } if (lists.size() == 0) { return make_numeric_column( @@ -337,7 +339,7 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, auto const lists_cv = lists.parent(); auto output = make_numeric_column(data_type{type_to_id()}, lists.size(), - copy_bitmask(lists_cv, stream, mr), + cudf::detail::copy_bitmask(lists_cv, stream, mr), lists_cv.null_count(), stream, mr); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c0765b48205..00a2f0bee8f 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -381,7 +381,7 @@ std::unique_ptr column_merger::operator()( // materialize the output buffer rmm::device_buffer validity = lcol.has_nulls() || rcol.has_nulls() - ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) + ? detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) : rmm::device_buffer{}; if (lcol.has_nulls() || rcol.has_nulls()) { materialize_bitmask(lcol, diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 41cce57d55b..8a6367a1f87 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -219,8 +219,12 @@ std::unique_ptr round_with(column_view const& input, if (decimal_places >= 0 && std::is_integral_v) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); @@ -256,8 +260,12 @@ std::unique_ptr round_with(column_view const& input, if (input.type().scale() > -decimal_places) return cudf::detail::cast(input, result_type, stream, mr); - auto result = cudf::make_fixed_width_column( - result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(result_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 85971647434..b8c7d058535 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -42,7 +42,7 @@ struct contains_column_dispatch { stream, mr); return std::make_unique( - std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count()); } }; diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index ee47932100a..f80ace57c69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -383,7 +383,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in std::move(offsets), std::move(chars->release().children.back()), input.null_count(), - copy_bitmask(input.parent(), stream, mr)); + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 913aec79758..045aac279e6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -290,7 +290,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 52f27c68111..7a0cfb9ef41 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -57,7 +57,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(results), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -72,7 +72,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(strings_child), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -160,7 +160,7 @@ std::unique_ptr whitespace_split_record_fn(strings_column_view const& in std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 1c81f266200..6fa87b1f709 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -194,7 +194,7 @@ std::unique_ptr rescale(column_view input, auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); if (input.nullable()) { - auto const null_mask = copy_bitmask(input, stream, mr); + auto const null_mask = detail::copy_bitmask(input, stream, mr); output_column->set_null_mask(std::move(null_mask), input.null_count()); } return output_column; @@ -255,7 +255,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -285,7 +285,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -334,7 +334,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d0cae81a9c8..d84e0171b49 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -291,8 +291,12 @@ std::unique_ptr unary_op_with(column_view const& input, std::is_same_v>)) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3e30db7abcb..16e7239ebd8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,6 +629,7 @@ ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp new file mode 100644 index 00000000000..7e59201c8cf --- /dev/null +++ b/cpp/tests/streams/null_mask_test.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class NullMaskTest : public cudf::test::BaseFixture {}; + +TEST_F(NullMaskTest, CreateNullMask) +{ + cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, SetNullMask) +{ + cudf::test::fixed_width_column_wrapper col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::set_null_mask(static_cast(col).null_mask(), + 0, + 3, + false, + cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmask) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask( + static_cast(col).null_mask(), 0, 3, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmaskFromColumn) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask(col, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskAnd) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_and(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskOr) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_or(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, NullCount) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + cudf::null_count( + static_cast(col).null_mask(), 0, 4, cudf::test::get_default_stream()); +} From e7c6365a4976881dc3cf0bcbfa254eb664cfe877 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 20 Oct 2023 11:32:38 -0700 Subject: [PATCH 4/5] Detect and report errors in Parquet header parsing (#14237) Fixes #13656. Uses the error reporting introduced in #14167 to report errors in header parsing. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14237 --- cpp/src/io/parquet/error.hpp | 77 ++++++++++++++++++ cpp/src/io/parquet/page_data.cu | 5 +- cpp/src/io/parquet/page_delta_decode.cu | 5 +- cpp/src/io/parquet/page_hdr.cu | 58 ++++++------- cpp/src/io/parquet/page_string_decode.cu | 5 +- cpp/src/io/parquet/parquet_gpu.hpp | 32 +++++++- cpp/src/io/parquet/reader_impl.cpp | 11 ++- cpp/src/io/parquet/reader_impl_preprocess.cu | 44 ++++------ .../cudf/tests/data/parquet/bad_dict.parquet | Bin 0 -> 2850 bytes python/cudf/cudf/tests/test_parquet.py | 8 ++ 10 files changed, 170 insertions(+), 75 deletions(-) create mode 100644 cpp/src/io/parquet/error.hpp create mode 100644 python/cudf/cudf/tests/data/parquet/bad_dict.parquet diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp new file mode 100644 index 00000000000..92b5eebe9fd --- /dev/null +++ b/cpp/src/io/parquet/error.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::io::parquet { + +/** + * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * kernel calls. + * + * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout + * the object's lifetime. + */ +class kernel_error { + private: + rmm::device_scalar _error_code; + + public: + /** + * @brief Construct a new `kernel_error` with an initial value of 0. + * + * Note: the initial value is set asynchronously. + * + * @throws `rmm::bad_alloc` if allocating the device memory for `initial_value` fails. + * @throws `rmm::cuda_error` if copying `initial_value` to device memory fails. + * + * @param CUDA stream to use + */ + kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {} + + /** + * @brief Return a pointer to the device memory for the error + */ + [[nodiscard]] auto data() { return _error_code.data(); } + + /** + * @brief Return the current value of the error + * + * This uses the stream used to create this instance. This does a synchronize on the stream + * this object was instantiated with. + */ + [[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); } + + /** + * @brief Return a hexadecimal string representation of the current error code + * + * Returned string will have "0x" prepended. + */ + [[nodiscard]] std::string str() const + { + std::stringstream sstream; + sstream << std::hex << value(); + return "0x" + sstream.str(); + } +}; + +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index cce3659b902..a783b489c02 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -599,10 +599,7 @@ __global__ void __launch_bounds__(decode_block_size) } __syncthreads(); } - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } struct mask_tform { diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index d25684a59f3..bb5e5066b69 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -151,10 +151,7 @@ __global__ void __launch_bounds__(96) __syncthreads(); } - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } } // anonymous namespace diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index eae8e05e61e..22add2fffc6 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -16,6 +16,9 @@ #include "parquet_gpu.hpp" #include + +#include + #include #include @@ -25,23 +28,6 @@ namespace cudf::io::parquet::detail { // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md -static const __device__ __constant__ uint8_t g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, - ST_FLD_SET, - ST_FLD_LIST}; - struct byte_stream_s { uint8_t const* cur{}; uint8_t const* end{}; @@ -140,12 +126,13 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; - if (n == 0xf) n = get_u32(bs); - field_type = g_list2struct[c & 0xf]; - if (field_type == ST_FLD_STRUCT) + if (n == 0xf) { n = get_u32(bs); } + field_type = c & 0xf; + if (field_type == ST_FLD_STRUCT) { struct_depth += n; - else + } else { rep_cnt = n; + } } break; case ST_FLD_STRUCT: struct_depth++; break; } @@ -356,16 +343,20 @@ struct gpuParsePageHeader { */ // blockDim {128,1,1} __global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks) + gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, int32_t* error_code) { + using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; __shared__ byte_stream_s bs_g[4]; - int lane_id = threadIdx.x % 32; - int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - byte_stream_s* const bs = &bs_g[threadIdx.x / 32]; + int32_t error[4] = {0}; + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + auto const chunk = (blockIdx.x * 4) + warp_id; + auto const bs = &bs_g[warp_id]; - if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk]; + if (chunk < num_chunks and lane_id == 0) { bs->ck = chunks[chunk]; } + if (lane_id == 0) { error[warp_id] = 0; } __syncthreads(); if (chunk < num_chunks) { @@ -376,7 +367,7 @@ __global__ void __launch_bounds__(128) int32_t num_dict_pages = bs->ck.num_dict_pages; PageInfo* page_info; - if (!lane_id) { + if (lane_id == 0) { bs->base = bs->cur = bs->ck.compressed_data; bs->end = bs->base + bs->ck.compressed_size; bs->page.chunk_idx = chunk; @@ -412,6 +403,9 @@ __global__ void __launch_bounds__(128) bs->page.lvl_bytes[level_type::DEFINITION] = 0; bs->page.lvl_bytes[level_type::REPETITION] = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { + if (not is_supported_encoding(bs->page.encoding)) { + error[warp_id] |= static_cast(decode_error::UNSUPPORTED_ENCODING); + } switch (bs->page_type) { case PageType::DATA_PAGE: index_out = num_dict_pages + data_page_count; @@ -440,20 +434,25 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; + if (bs->cur > bs->end) { + error[warp_id] |= static_cast(decode_error::DATA_STREAM_OVERRUN); + } bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck); } else { bs->cur = bs->end; } } index_out = shuffle(index_out); - if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) + if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) { page_info[index_out] = bs->page; + } num_values = shuffle(num_values); __syncwarp(); } if (lane_id == 0) { chunks[chunk].num_data_pages = data_page_count; chunks[chunk].num_dict_pages = dictionary_page_count; + if (error[warp_id] != 0) { set_error(error[warp_id], error_code); } } } } @@ -509,11 +508,12 @@ __global__ void __launch_bounds__(128) void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, + int32_t* error_code, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks); + gpuDecodePageHeaders<<>>(chunks, num_chunks, error_code); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 4d79770ec34..4c7d8e3c20a 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -745,10 +745,7 @@ __global__ void __launch_bounds__(decode_block_size) auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); block_excl_sum(offptr, value_count, s->page.str_offset); - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } } // anonymous namespace diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 048f1a73a9c..164e2cea2ed 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include @@ -54,6 +56,30 @@ constexpr int rolling_index(int index) return index % rolling_size; } +// see setupLocalPageInfo() in page_decode.cuh for supported page encodings +constexpr bool is_supported_encoding(Encoding enc) +{ + switch (enc) { + case Encoding::PLAIN: + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE: + case Encoding::RLE_DICTIONARY: + case Encoding::DELTA_BINARY_PACKED: return true; + default: return false; + } +} + +/** + * @brief Atomically OR `error` into `error_code`. + */ +constexpr void set_error(int32_t error, int32_t* error_code) +{ + if (error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(error, cuda::std::memory_order_relaxed); + } +} + /** * @brief Enum for the different types of errors that can occur during decoding. * @@ -495,9 +521,13 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk) * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void DecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + int32_t* error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index db81222157a..11c20d0e540 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -15,6 +15,7 @@ */ #include "reader_impl.hpp" +#include "error.hpp" #include #include @@ -163,7 +164,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - rmm::device_scalar error_code(0, _stream); + // create this before we fork streams + kernel_error error_code(_stream); // get the number of streams we need from the pool and tell them to wait on the H2D copies int const nkernels = std::bitset<32>(kernel_mask).count(); @@ -199,11 +201,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); - auto const decode_error = error_code.value(_stream); - if (decode_error != 0) { - std::stringstream stream; - stream << std::hex << decode_error; - CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str()); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } // for list columns, add the final offset to every offset buffer. diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index ce45f709ee1..8494dc72a1d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "error.hpp" #include "reader_impl.hpp" #include @@ -263,10 +264,15 @@ void generate_depth_remappings(std::map, std::ve { size_t total_pages = 0; + kernel_error error_code(stream); chunks.host_to_device_async(stream); - DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); chunks.device_to_host_sync(stream); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str()); + } + for (size_t c = 0; c < chunks.size(); c++) { total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; } @@ -274,19 +280,6 @@ void generate_depth_remappings(std::map, std::ve return total_pages; } -// see setupLocalPageInfo() in page_data.cu for supported page encodings -constexpr bool is_supported_encoding(Encoding enc) -{ - switch (enc) { - case Encoding::PLAIN: - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE: - case Encoding::RLE_DICTIONARY: - case Encoding::DELTA_BINARY_PACKED: return true; - default: return false; - } -} - /** * @brief Decode the page information from the given column chunks. * @@ -307,8 +300,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks page_count += chunks[c].max_num_pages; } + kernel_error error_code(stream); chunks.host_to_device_async(stream); - DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); + + if (error_code.value() != 0) { + // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one + CUDF_FAIL("Parquet header parsing failed with code(s)" + error_code.str()); + } // compute max bytes needed for level data auto level_bit_size = @@ -318,22 +317,13 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. - int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), + int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), level_bit_size, level_bit_size + chunks.size(), 0, thrust::maximum()); - auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); - - pages.device_to_host_sync(stream); - // validate page encodings - CUDF_EXPECTS(std::all_of(pages.begin(), - pages.end(), - [](auto const& page) { return is_supported_encoding(page.encoding); }), - "Unsupported page encoding detected"); - - return level_type_size; + return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); } /** @@ -771,6 +761,7 @@ void reader::impl::load_and_decompress_data() // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); + pages.device_to_host_sync(_stream); if (has_compressed_data) { decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data @@ -795,7 +786,6 @@ void reader::impl::load_and_decompress_data() // std::vector output_info = build_output_column_info(); // the following two allocate functions modify the page data - pages.device_to_host_sync(_stream); { // nesting information (sizes, etc) stored -per page- // note : even for flat schemas, we allocate 1 level of "nesting" info diff --git a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5008ac0b22b622cbcf37d5e286c749324fc26535 GIT binary patch literal 2850 zcmb7G&2Af26n0F~Hfi|Brja6Y7Z7#fSP&MJN~pStO)_mfb{z*P0aRU&ubpYebEosu z#$t!MKs*8qR@ng0z>-bZJOz)yito(KnX$tHVfi>WbIxG@;>Aa{l8rMmA*#^4*%p<9v!s%mL$)zf6%rEmMtmdUSFPBN3`0PwrojCrG0SP z8&DqY!J*adS*hZeeQEEv$rT)RqI~1aSa*QmfStqtNXt%^8%RXpzJJiDb$?-rA+Q&fUUhN*ivpf&a%BuK5 z{`mRbU;g;%yN_R4zx_`AM1%hWO+u7}GN%tAOF}4<&~p-AV_AHzvw@jG_OyotUwwdY zj^?L$B+k`j;_0SvRH(;^=gq{}^_3?^IxvwNM_9z#iAChiu%{-FJxy?N7zR*W?u2S~U*sh;78E3`k*L0Ok_ezU>Z$QcY7|9qH=8BN zg3v|#RS1%VaFRo&FN09Sp^9eHAU0{6I3cD*PTtgucrh%vi+`@qD4k5uHvcPxApWaG#x`#T;Ba@bX|{ns~7rcrKXUEJ(Ca ztFIKieadgbAGMu=s0tm2k!s9T~s%| zIb53J9kEvZsCsA3SFvmSCpZ8rwRJpL_SID9d+5u;-fhQBrvT@PW4uazcVlzoX#3V? zt-7(YG4H5`z53rv_-^CHZu|S{W^J8RQrGF`%a!`e6>MPl>LxoVqs=+E!rR#P&#SqI k`2

tF8Lg7WV0tw7y%pyB4SrY>0QJVdXCX(Zk#EKmWZ<*#H0l literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b892cc62ac4..d2c08246518 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2830,6 +2830,14 @@ def test_parquet_reader_unsupported_page_encoding(datadir): cudf.read_parquet(fname) +def test_parquet_reader_detect_bad_dictionary(datadir): + fname = datadir / "bad_dict.parquet" + + # expect a failure when reading the whole file + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) + + @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) @pytest.mark.parametrize("force_nullable_schema", [True, False]) def test_parquet_writer_schema_nullability(data, force_nullable_schema): From 253f6a6d5b19387c05368e073954ff773b3d6a39 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 20 Oct 2023 14:05:53 -0700 Subject: [PATCH 5/5] Refactor LogicalType for Parquet (#14264) Continuation of #14097, this PR refactors the LogicalType struct to use the new way of treating unions defined in the parquet thrift (more enum like than struct like). Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14264 --- .../io/parquet/compact_protocol_reader.cpp | 95 +++-------- .../io/parquet/compact_protocol_writer.cpp | 81 +++++---- cpp/src/io/parquet/page_decode.cuh | 3 +- cpp/src/io/parquet/parquet.hpp | 156 +++++++++++------- cpp/src/io/parquet/parquet_gpu.hpp | 30 ++-- cpp/src/io/parquet/reader_impl_chunking.cu | 13 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 104 ++++++------ cpp/src/io/parquet/writer_impl.cu | 107 +++++++----- cpp/tests/io/parquet_test.cpp | 7 +- 9 files changed, 293 insertions(+), 303 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 1a345ee0750..5a2b8aa8f2a 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -339,61 +339,6 @@ struct parquet_field_struct_list : public parquet_field_list { } }; -// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields -// in a struct -/** - * @brief Functor to read a union member from CompactProtocolReader - * - * @tparam is_empty True if tparam `T` type is empty type, else false. - * - * @return True if field types mismatch or if the process of reading a - * union member fails - */ -template -class ParquetFieldUnionFunctor : public parquet_field { - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - return !cpr->read(&val); - } - } -}; - -template -class ParquetFieldUnionFunctor : public parquet_field { - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - cpr->skip_struct_field(field_type); - return false; - } - } -}; - -template -ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) -{ - return ParquetFieldUnionFunctor>(f, b, v); -} - /** * @brief Functor to read a binary from CompactProtocolReader * @@ -595,34 +540,38 @@ bool CompactProtocolReader::read(FileMetaData* f) bool CompactProtocolReader::read(SchemaElement* s) { + using optional_converted_type = + parquet_field_optional>; + using optional_logical_type = + parquet_field_optional>; auto op = std::make_tuple(parquet_field_enum(1, s->type), parquet_field_int32(2, s->type_length), parquet_field_enum(3, s->repetition_type), parquet_field_string(4, s->name), parquet_field_int32(5, s->num_children), - parquet_field_enum(6, s->converted_type), + optional_converted_type(6, s->converted_type), parquet_field_int32(7, s->decimal_scale), parquet_field_int32(8, s->decimal_precision), parquet_field_optional(9, s->field_id), - parquet_field_struct(10, s->logical_type)); + optional_logical_type(10, s->logical_type)); return function_builder(this, op); } bool CompactProtocolReader::read(LogicalType* l) { - auto op = - std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING), - ParquetFieldUnion(2, l->isset.MAP, l->MAP), - ParquetFieldUnion(3, l->isset.LIST, l->LIST), - ParquetFieldUnion(4, l->isset.ENUM, l->ENUM), - ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct - ParquetFieldUnion(6, l->isset.DATE, l->DATE), - ParquetFieldUnion(7, l->isset.TIME, l->TIME), // read the struct - ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP), // read the struct - ParquetFieldUnion(10, l->isset.INTEGER, l->INTEGER), // read the struct - ParquetFieldUnion(11, l->isset.UNKNOWN, l->UNKNOWN), - ParquetFieldUnion(12, l->isset.JSON, l->JSON), - ParquetFieldUnion(13, l->isset.BSON, l->BSON)); + auto op = std::make_tuple( + parquet_field_union_enumerator(1, l->type), + parquet_field_union_enumerator(2, l->type), + parquet_field_union_enumerator(3, l->type), + parquet_field_union_enumerator(4, l->type), + parquet_field_union_struct(5, l->type, l->decimal_type), + parquet_field_union_enumerator(6, l->type), + parquet_field_union_struct(7, l->type, l->time_type), + parquet_field_union_struct(8, l->type, l->timestamp_type), + parquet_field_union_struct(10, l->type, l->int_type), + parquet_field_union_enumerator(11, l->type), + parquet_field_union_enumerator(12, l->type), + parquet_field_union_enumerator(13, l->type)); return function_builder(this, op); } @@ -648,9 +597,9 @@ bool CompactProtocolReader::read(TimestampType* t) bool CompactProtocolReader::read(TimeUnit* u) { - auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS), - ParquetFieldUnion(2, u->isset.MICROS, u->MICROS), - ParquetFieldUnion(3, u->isset.NANOS, u->NANOS)); + auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type), + parquet_field_union_enumerator(2, u->type), + parquet_field_union_enumerator(3, u->type)); return function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 00810269d3c..fbeda7f1099 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,6 +16,8 @@ #include "compact_protocol_writer.hpp" +#include + namespace cudf::io::parquet::detail { /** @@ -46,13 +48,11 @@ size_t CompactProtocolWriter::write(DecimalType const& decimal) size_t CompactProtocolWriter::write(TimeUnit const& time_unit) { CompactProtocolFieldWriter c(*this); - auto const isset = time_unit.isset; - if (isset.MILLIS) { - c.field_struct(1, time_unit.MILLIS); - } else if (isset.MICROS) { - c.field_struct(2, time_unit.MICROS); - } else if (isset.NANOS) { - c.field_struct(3, time_unit.NANOS); + switch (time_unit.type) { + case TimeUnit::MILLIS: + case TimeUnit::MICROS: + case TimeUnit::NANOS: c.field_empty_struct(time_unit.type); break; + default: CUDF_FAIL("Trying to write an invalid TimeUnit " + std::to_string(time_unit.type)); } return c.value(); } @@ -84,31 +84,29 @@ size_t CompactProtocolWriter::write(IntType const& integer) size_t CompactProtocolWriter::write(LogicalType const& logical_type) { CompactProtocolFieldWriter c(*this); - auto const isset = logical_type.isset; - if (isset.STRING) { - c.field_struct(1, logical_type.STRING); - } else if (isset.MAP) { - c.field_struct(2, logical_type.MAP); - } else if (isset.LIST) { - c.field_struct(3, logical_type.LIST); - } else if (isset.ENUM) { - c.field_struct(4, logical_type.ENUM); - } else if (isset.DECIMAL) { - c.field_struct(5, logical_type.DECIMAL); - } else if (isset.DATE) { - c.field_struct(6, logical_type.DATE); - } else if (isset.TIME) { - c.field_struct(7, logical_type.TIME); - } else if (isset.TIMESTAMP) { - c.field_struct(8, logical_type.TIMESTAMP); - } else if (isset.INTEGER) { - c.field_struct(10, logical_type.INTEGER); - } else if (isset.UNKNOWN) { - c.field_struct(11, logical_type.UNKNOWN); - } else if (isset.JSON) { - c.field_struct(12, logical_type.JSON); - } else if (isset.BSON) { - c.field_struct(13, logical_type.BSON); + switch (logical_type.type) { + case LogicalType::STRING: + case LogicalType::MAP: + case LogicalType::LIST: + case LogicalType::ENUM: + case LogicalType::DATE: + case LogicalType::UNKNOWN: + case LogicalType::JSON: + case LogicalType::BSON: c.field_empty_struct(logical_type.type); break; + case LogicalType::DECIMAL: + c.field_struct(LogicalType::DECIMAL, logical_type.decimal_type.value()); + break; + case LogicalType::TIME: + c.field_struct(LogicalType::TIME, logical_type.time_type.value()); + break; + case LogicalType::TIMESTAMP: + c.field_struct(LogicalType::TIMESTAMP, logical_type.timestamp_type.value()); + break; + case LogicalType::INTEGER: + c.field_struct(LogicalType::INTEGER, logical_type.int_type.value()); + break; + default: + CUDF_FAIL("Trying to write an invalid LogicalType " + std::to_string(logical_type.type)); } return c.value(); } @@ -124,20 +122,15 @@ size_t CompactProtocolWriter::write(SchemaElement const& s) c.field_string(4, s.name); if (s.type == UNDEFINED_TYPE) { c.field_int(5, s.num_children); } - if (s.converted_type != UNKNOWN) { - c.field_int(6, s.converted_type); + if (s.converted_type.has_value()) { + c.field_int(6, s.converted_type.value()); if (s.converted_type == DECIMAL) { c.field_int(7, s.decimal_scale); c.field_int(8, s.decimal_precision); } } - if (s.field_id) { c.field_int(9, s.field_id.value()); } - auto const isset = s.logical_type.isset; - // TODO: add handling for all logical types - // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or - // isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON) - // { - if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); } + if (s.field_id.has_value()) { c.field_int(9, s.field_id.value()); } + if (s.logical_type.has_value()) { c.field_struct(10, s.logical_type.value()); } return c.value(); } @@ -223,9 +216,9 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s) size_t CompactProtocolWriter::write(ColumnOrder const& co) { CompactProtocolFieldWriter c(*this); - switch (co) { - case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break; - default: break; + switch (co.type) { + case ColumnOrder::TYPE_ORDER: c.field_empty_struct(co.type); break; + default: CUDF_FAIL("Trying to write an invalid ColumnOrder " + std::to_string(co.type)); } return c.value(); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 7c866fd8b9e..ab1cc68923d 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1143,7 +1143,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, units = cudf::timestamp_ms::period::den; } else if (s->col.converted_type == TIMESTAMP_MICROS) { units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { + } else if (s->col.logical_type.has_value() and + s->col.logical_type->is_timestamp_nanos()) { units = cudf::timestamp_ns::period::den; } if (units and units != s->col.ts_clock_rate) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 1cd16ac6102..699cad89703 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -46,79 +46,98 @@ struct file_ender_s { uint32_t magic; }; -// thrift generated code simplified. -struct StringType {}; -struct MapType {}; -struct ListType {}; -struct EnumType {}; +// thrift inspired code simplified. struct DecimalType { int32_t scale = 0; int32_t precision = 0; }; -struct DateType {}; - -struct MilliSeconds {}; -struct MicroSeconds {}; -struct NanoSeconds {}; -using TimeUnit_isset = struct TimeUnit_isset { - bool MILLIS{false}; - bool MICROS{false}; - bool NANOS{false}; -}; struct TimeUnit { - TimeUnit_isset isset; - MilliSeconds MILLIS; - MicroSeconds MICROS; - NanoSeconds NANOS; + enum Type { UNDEFINED, MILLIS, MICROS, NANOS }; + Type type; }; struct TimeType { bool isAdjustedToUTC = false; TimeUnit unit; }; + struct TimestampType { bool isAdjustedToUTC = false; TimeUnit unit; }; + struct IntType { int8_t bitWidth = 0; bool isSigned = false; }; -struct NullType {}; -struct JsonType {}; -struct BsonType {}; - -// thrift generated code simplified. -using LogicalType_isset = struct LogicalType_isset { - bool STRING{false}; - bool MAP{false}; - bool LIST{false}; - bool ENUM{false}; - bool DECIMAL{false}; - bool DATE{false}; - bool TIME{false}; - bool TIMESTAMP{false}; - bool INTEGER{false}; - bool UNKNOWN{false}; - bool JSON{false}; - bool BSON{false}; -}; struct LogicalType { - LogicalType_isset isset; - StringType STRING; - MapType MAP; - ListType LIST; - EnumType ENUM; - DecimalType DECIMAL; - DateType DATE; - TimeType TIME; - TimestampType TIMESTAMP; - IntType INTEGER; - NullType UNKNOWN; - JsonType JSON; - BsonType BSON; + enum Type { + UNDEFINED, + STRING, + MAP, + LIST, + ENUM, + DECIMAL, + DATE, + TIME, + TIMESTAMP, + // 9 is reserved + INTEGER = 10, + UNKNOWN, + JSON, + BSON + }; + Type type; + thrust::optional decimal_type; + thrust::optional time_type; + thrust::optional timestamp_type; + thrust::optional int_type; + + LogicalType(Type tp = UNDEFINED) : type(tp) {} + LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} + LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} + LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} + LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} + + constexpr bool is_time_millis() const + { + return type == TIME and time_type->unit.type == TimeUnit::MILLIS; + } + + constexpr bool is_time_micros() const + { + return type == TIME and time_type->unit.type == TimeUnit::MICROS; + } + + constexpr bool is_time_nanos() const + { + return type == TIME and time_type->unit.type == TimeUnit::NANOS; + } + + constexpr bool is_timestamp_millis() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; + } + + constexpr bool is_timestamp_micros() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; + } + + constexpr bool is_timestamp_nanos() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; + } + + constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } + + constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; } + + constexpr int32_t scale() const { return type == DECIMAL ? decimal_type->scale : -1; } + + constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; } }; /** @@ -127,8 +146,6 @@ struct LogicalType { struct ColumnOrder { enum Type { UNDEFINED, TYPE_ORDER }; Type type; - - operator Type() const { return type; } }; /** @@ -138,18 +155,29 @@ struct ColumnOrder { * as a schema tree. */ struct SchemaElement { - Type type = UNDEFINED_TYPE; - ConvertedType converted_type = UNKNOWN; - LogicalType logical_type; - int32_t type_length = - 0; // Byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types + // 1: parquet physical type for output + Type type = UNDEFINED_TYPE; + // 2: byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types + int32_t type_length = 0; + // 3: repetition of the field FieldRepetitionType repetition_type = REQUIRED; - std::string name = ""; - int32_t num_children = 0; - int32_t decimal_scale = 0; - int32_t decimal_precision = 0; - thrust::optional field_id = thrust::nullopt; - bool output_as_byte_array = false; + // 4: name of the field + std::string name = ""; + // 5: nested fields + int32_t num_children = 0; + // 6: DEPRECATED: record the original type before conversion to parquet type + thrust::optional converted_type; + // 7: DEPRECATED: record the scale for DECIMAL converted type + int32_t decimal_scale = 0; + // 8: DEPRECATED: record the precision for DECIMAL converted type + int32_t decimal_precision = 0; + // 9: save field_id from original schema + thrust::optional field_id; + // 10: replaces converted type + thrust::optional logical_type; + + // extra cudf specific fields + bool output_as_byte_array = false; // The following fields are filled in later during schema initialization int max_definition_level = 0; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 164e2cea2ed..68851e72663 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -313,7 +313,7 @@ struct ColumnChunkDesc { uint8_t rep_level_bits_, int8_t codec_, int8_t converted_type_, - LogicalType logical_type_, + thrust::optional logical_type_, int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, @@ -355,20 +355,20 @@ struct ColumnChunkDesc { uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages - int32_t max_num_pages{}; // size of page_info array - PageInfo* page_info{}; // output page info for up to num_dict_pages + - // num_data_pages (dictionary pages first) - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum - LogicalType logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision + level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages + int32_t max_num_pages{}; // size of page_info array + PageInfo* page_info{}; // output page info for up to num_dict_pages + + // num_data_pages (dictionary pages first) + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + int8_t codec{}; // compressed codec enum + int8_t converted_type{}; // converted type enum + thrust::optional logical_type{}; // logical type + int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index ad52a7dfcc1..213fc380a34 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -304,11 +304,12 @@ std::vector find_splits(std::vector const& * * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - Type physical, - int8_t converted, - int32_t length) +[[nodiscard]] std::tuple conversion_info( + type_id column_type_id, + type_id timestamp_type_id, + Type physical, + thrust::optional converted, + int32_t length) { int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; int32_t clock_rate = 0; @@ -322,7 +323,7 @@ std::vector find_splits(std::vector const& clock_rate = to_clockrate(timestamp_type_id); } - int8_t converted_type = converted; + int8_t converted_type = converted.value_or(UNKNOWN); if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { converted_type = UNKNOWN; // Not converting to float64 or decimal diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 040c6403f57..a9c84143e1a 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -25,44 +25,42 @@ namespace cudf::io::parquet::detail { namespace { -ConvertedType logical_type_to_converted_type(LogicalType const& logical) +ConvertedType logical_type_to_converted_type(thrust::optional const& logical) { - if (logical.isset.STRING) { - return UTF8; - } else if (logical.isset.MAP) { - return MAP; - } else if (logical.isset.LIST) { - return LIST; - } else if (logical.isset.ENUM) { - return ENUM; - } else if (logical.isset.DECIMAL) { - return DECIMAL; // TODO set decimal values - } else if (logical.isset.DATE) { - return DATE; - } else if (logical.isset.TIME) { - if (logical.TIME.unit.isset.MILLIS) - return TIME_MILLIS; - else if (logical.TIME.unit.isset.MICROS) - return TIME_MICROS; - } else if (logical.isset.TIMESTAMP) { - if (logical.TIMESTAMP.unit.isset.MILLIS) - return TIMESTAMP_MILLIS; - else if (logical.TIMESTAMP.unit.isset.MICROS) - return TIMESTAMP_MICROS; - } else if (logical.isset.INTEGER) { - switch (logical.INTEGER.bitWidth) { - case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; - case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; - case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; - case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64; - default: break; - } - } else if (logical.isset.UNKNOWN) { - return NA; - } else if (logical.isset.JSON) { - return JSON; - } else if (logical.isset.BSON) { - return BSON; + if (not logical.has_value()) { return UNKNOWN; } + switch (logical->type) { + case LogicalType::STRING: return UTF8; + case LogicalType::MAP: return MAP; + case LogicalType::LIST: return LIST; + case LogicalType::ENUM: return ENUM; + case LogicalType::DECIMAL: return DECIMAL; // TODO use decimal scale/precision + case LogicalType::DATE: return DATE; + case LogicalType::TIME: + if (logical->is_time_millis()) { + return TIME_MILLIS; + } else if (logical->is_time_micros()) { + return TIME_MICROS; + } + break; + case LogicalType::TIMESTAMP: + if (logical->is_timestamp_millis()) { + return TIMESTAMP_MILLIS; + } else if (logical->is_timestamp_micros()) { + return TIMESTAMP_MICROS; + } + break; + case LogicalType::INTEGER: + switch (logical->bit_width()) { + case 8: return logical->is_signed() ? INT_8 : UINT_8; + case 16: return logical->is_signed() ? INT_16 : UINT_16; + case 32: return logical->is_signed() ? INT_32 : UINT_32; + case 64: return logical->is_signed() ? INT_64 : UINT_64; + default: break; + } + case LogicalType::UNKNOWN: return NA; + case LogicalType::JSON: return JSON; + case LogicalType::BSON: return BSON; + default: break; } return UNKNOWN; } @@ -76,20 +74,20 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - Type const physical = schema.type; - LogicalType const logical_type = schema.logical_type; - ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + auto const physical = schema.type; + auto const logical_type = schema.logical_type; + auto converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; + // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to + // converted_type if logical_type isn't set // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. auto const inferred_converted_type = logical_type_to_converted_type(logical_type); if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == DECIMAL) { - decimal_precision = schema.logical_type.DECIMAL.precision; - } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); } - switch (converted_type) { + switch (converted_type.value_or(UNKNOWN)) { case UINT_8: return type_id::UINT8; case INT_8: return type_id::INT8; case UINT_16: return type_id::UINT16; @@ -140,15 +138,13 @@ type_id to_type_id(SchemaElement const& schema, default: break; } - if (inferred_converted_type == UNKNOWN and physical == INT64 and - logical_type.TIMESTAMP.unit.isset.NANOS) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } - - if (inferred_converted_type == UNKNOWN and physical == INT64 and - logical_type.TIME.unit.isset.NANOS) { - return type_id::DURATION_NANOSECONDS; + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) { + if (logical_type->is_timestamp_nanos()) { + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + } else if (logical_type->is_time_nanos()) { + return type_id::DURATION_NANOSECONDS; + } } // is it simply a struct? diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 50589f23626..c06acc1690b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -284,6 +284,7 @@ struct leaf_schema_fn { { col_schema.type = Type::BOOLEAN; col_schema.stats_dtype = statistics_dtype::dtype_bool; + // BOOLEAN needs no converted or logical type } template @@ -292,6 +293,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::INT_8; col_schema.stats_dtype = statistics_dtype::dtype_int8; + col_schema.logical_type = LogicalType{IntType{8, true}}; } template @@ -300,6 +302,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::INT_16; col_schema.stats_dtype = statistics_dtype::dtype_int16; + col_schema.logical_type = LogicalType{IntType{16, true}}; } template @@ -307,6 +310,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT32; col_schema.stats_dtype = statistics_dtype::dtype_int32; + // INT32 needs no converted or logical type } template @@ -314,6 +318,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // INT64 needs no converted or logical type } template @@ -322,6 +327,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_8; col_schema.stats_dtype = statistics_dtype::dtype_int8; + col_schema.logical_type = LogicalType{IntType{8, false}}; } template @@ -330,6 +336,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_16; col_schema.stats_dtype = statistics_dtype::dtype_int16; + col_schema.logical_type = LogicalType{IntType{16, false}}; } template @@ -338,6 +345,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_32; col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{IntType{32, false}}; } template @@ -346,6 +354,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT64; col_schema.converted_type = ConvertedType::UINT_64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{IntType{64, false}}; } template @@ -353,6 +362,7 @@ struct leaf_schema_fn { { col_schema.type = Type::FLOAT; col_schema.stats_dtype = statistics_dtype::dtype_float32; + // FLOAT needs no converted or logical type } template @@ -360,6 +370,7 @@ struct leaf_schema_fn { { col_schema.type = Type::DOUBLE; col_schema.stats_dtype = statistics_dtype::dtype_float64; + // DOUBLE needs no converted or logical type } template @@ -367,11 +378,12 @@ struct leaf_schema_fn { { col_schema.type = Type::BYTE_ARRAY; if (col_meta.is_enabled_output_as_binary()) { - col_schema.converted_type = ConvertedType::UNKNOWN; - col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + // BYTE_ARRAY needs no converted or logical type } else { col_schema.converted_type = ConvertedType::UTF8; col_schema.stats_dtype = statistics_dtype::dtype_string; + col_schema.logical_type = LogicalType{LogicalType::STRING}; } } @@ -381,49 +393,55 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{LogicalType::DATE}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; col_schema.ts_scale = 1000; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::MICROS}}; + } } template std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value } // set logical type if it's not int96 else { - col_schema.logical_type.isset.TIMESTAMP = true; - col_schema.logical_type.TIMESTAMP.unit.isset.NANOS = true; + col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::NANOS}}; } } @@ -431,53 +449,48 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MICROS = true; + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MICROS}}; } // unsupported outside cudf for parquet 1.0. template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.NANOS = true; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::NANOS}}; } template @@ -487,27 +500,32 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; } else if (std::is_same_v) { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_decimal64; col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; } else if (std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { CUDF_FAIL("Unsupported fixed point type for parquet writer"); } col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs + col_schema.logical_type->decimal_type->scale = -col->type().scale(); if (col_meta.is_decimal_precision_set()) { CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale, "Precision must be equal to or greater than scale!"); if (col_schema.type == Type::INT64 and col_meta.get_decimal_precision() < 10) { CUDF_LOG_WARN("Parquet writer: writing a decimal column with precision < 10 as int64"); } - col_schema.decimal_precision = col_meta.get_decimal_precision(); + col_schema.decimal_precision = col_meta.get_decimal_precision(); + col_schema.logical_type->decimal_type->precision = col_meta.get_decimal_precision(); } } @@ -593,7 +611,7 @@ std::vector construct_schema_tree( schema_tree_node col_schema{}; col_schema.type = Type::BYTE_ARRAY; - col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -762,7 +780,10 @@ struct parquet_column_view { [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } [[nodiscard]] Type physical_type() const { return schema_node.type; } - [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] ConvertedType converted_type() const + { + return schema_node.converted_type.value_or(UNKNOWN); + } std::vector const& get_path_in_schema() { return path_in_schema; } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 2a654bd7e8c..fece83f891b 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4075,11 +4075,12 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cudf::io::parquet::detail::ConvertedType ctype) + thrust::optional const& ctype) { + auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { case cudf::io::parquet::detail::INT32: - switch (ctype) { + switch (ctype_val) { case cudf::io::parquet::detail::UINT_8: case cudf::io::parquet::detail::UINT_16: case cudf::io::parquet::detail::UINT_32: @@ -4091,7 +4092,7 @@ int32_t compare_binary(std::vector const& v1, } case cudf::io::parquet::detail::INT64: - if (ctype == cudf::io::parquet::detail::UINT_64) { + if (ctype_val == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); }