Skip to content

Commit

Permalink
Support DurationType in cudf parquet reader via arrow:schema (#15617
Browse files Browse the repository at this point in the history
)

This PR adds support for reading and using the `arrow:schema` struct from the serialized `arrow:ipc` message written to the key-value metadata section of the Parquet file under the `ARROW:schema` key. This allows cudf to read and interoperate with Arrow for non-standard Parquet types (`DurationType` in this PR).

Arrow uses Google flatbuffers (inside Schema.fbs) to serialize the `arrow:Schema` structure (containing column descriptors) and puts it (padded for 8 byte alignment) into the header of an empty `ipc:Message` (also a flatbuffer-serialized structure inside Message.fbs). The `ipc:Message` is prepended with two integers containing a `validity` message and the `size of the header` (the `arrow:Schema` + padding). The final message is encoded as a base64 string and written to the Parquet file footer key-value metadata using the `"ARROW:schema"` key.

In this PR, we base64-decode the `ipc:Message`, then we decode the `validity` message and the header size, and offset pointers to the `arrow:Schema` flatbuffer. We then use Flatbuffer structs to walk the `arrow:Schema` and collect information on columns of interest as an unordered_map (using column name as key).  This unordered_map is used inside `select_columns` function to build cudf Table columns and get the correct `dtype`.

Closes #13410

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: #15617
  • Loading branch information
mhaseeb123 authored May 15, 2024
1 parent fa9d028 commit c5c95b7
Show file tree
Hide file tree
Showing 18 changed files with 5,152 additions and 15 deletions.
8 changes: 5 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,11 @@ repos:
hooks:
- id: verify-copyright
exclude: |
(?x)
cpp/include/cudf_test/cxxopts[.]hpp$
(?x)^(
cpp/include/cudf_test/cxxopts[.]hpp$|
cpp/src/io/parquet/ipc/Message_generated[.]h$|
cpp/src/io/parquet/ipc/Schema_generated[.]h$
)
default_language_version:
python: python3
4 changes: 4 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ include(cmake/thirdparty/get_cccl.cmake)
include(cmake/thirdparty/get_rmm.cmake)
# find arrow
include(cmake/thirdparty/get_arrow.cmake)
# find flatbuffers
include(cmake/thirdparty/get_flatbuffers.cmake)
# find dlpack
include(cmake/thirdparty/get_dlpack.cmake)
# find cuCollections, should come after including CCCL
Expand Down Expand Up @@ -429,6 +431,7 @@ add_library(
src/io/text/bgzip_utils.cpp
src/io/text/multibyte_split.cu
src/io/utilities/arrow_io_source.cpp
src/io/utilities/base64_utilities.cpp
src/io/utilities/column_buffer.cpp
src/io/utilities/column_buffer_strings.cu
src/io/utilities/config_utils.cpp
Expand Down Expand Up @@ -742,6 +745,7 @@ target_include_directories(
"$<BUILD_INTERFACE:${CUDF_GENERATED_INCLUDE_DIR}/include>"
PRIVATE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
"$<BUILD_INTERFACE:${nanoarrow_SOURCE_DIR}/src>"
"$<BUILD_INTERFACE:${FlatBuffers_SOURCE_DIR}/include>"
INTERFACE "$<INSTALL_INTERFACE:include>"
)

Expand Down
33 changes: 33 additions & 0 deletions cpp/cmake/thirdparty/get_flatbuffers.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Use CPM to find or clone flatbuffers
#
# find_and_configure_flatbuffers(VERSION)
#
# VERSION: flatbuffers release to request, without the leading "v" (the git tag is formed as
# "v${VERSION}" below).
function(find_and_configure_flatbuffers VERSION)

# Request the given flatbuffers version; when cloning, a shallow checkout of tag v${VERSION} from
# the upstream repository is used.
rapids_cpm_find(
flatbuffers ${VERSION}
GLOBAL_TARGETS flatbuffers
CPM_ARGS
GIT_REPOSITORY https://github.com/google/flatbuffers.git
GIT_TAG v${VERSION}
GIT_SHALLOW TRUE
)

# Associate the flatbuffers binary directory with the BUILD side of the cudf-exports export set.
# NOTE(review): presumably so consumers of the cudf build tree can locate this flatbuffers --
# confirm against the rapids-cmake documentation.
rapids_export_find_package_root(
BUILD flatbuffers "${flatbuffers_BINARY_DIR}" EXPORT_SET cudf-exports
)

endfunction()

# Version of flatbuffers requested by cudf.
find_and_configure_flatbuffers(24.3.25)
28 changes: 28 additions & 0 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ class parquet_reader_options {
bool _convert_strings_to_categories = false;
// Whether to use PANDAS metadata to load columns
bool _use_pandas_metadata = true;
// Whether to read and use ARROW schema
bool _use_arrow_schema = true;
// Cast timestamp columns to a specific type
data_type _timestamp_type{type_id::EMPTY};

Expand Down Expand Up @@ -126,6 +128,13 @@ class parquet_reader_options {
*/
[[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; }

/**
* @brief Returns true/false depending on whether the arrow schema is used while reading.
*
* The underlying flag is initialized to `true` and can be changed with
* `enable_use_arrow_schema()`.
*
* @return `true` if arrow schema is used while reading
*/
[[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; }

/**
* @brief Returns optional tree of metadata.
*
Expand Down Expand Up @@ -214,6 +223,13 @@ class parquet_reader_options {
*/
void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; }

/**
* @brief Enables or disables use of the arrow schema while reading.
*
* @param val `true` to use the arrow schema while reading, `false` otherwise
*/
void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; }

/**
* @brief Sets reader column schema.
*
Expand Down Expand Up @@ -328,6 +344,18 @@ class parquet_reader_options_builder {
return *this;
}

/**
* @brief Enables or disables use of the arrow schema while reading.
*
* @param val `true` to use the arrow schema while reading, `false` otherwise
* @return this for chaining
*/
parquet_reader_options_builder& use_arrow_schema(bool val)
{
options._use_arrow_schema = val;
return *this;
}

/**
* @brief Sets reader metadata.
*
Expand Down
Loading

0 comments on commit c5c95b7

Please sign in to comment.