Merge branch 'branch-23.12' into bug-merge-overflow
divyegala authored Oct 31, 2023
2 parents bec7feb + f4c95aa commit 829d9ea
Showing 30 changed files with 631 additions and 168 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pr.yaml
@@ -101,6 +101,7 @@ jobs:
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
with:
build_type: pull-request
build-2_28-wheels: "true"
script: "ci/build_wheel_cudf.sh"
wheel-tests-cudf:
needs: wheel-build-cudf
3 changes: 1 addition & 2 deletions ci/build_wheel_cudf.sh
@@ -9,8 +9,7 @@ export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF"

./ci/build_wheel.sh cudf ${package_dir}

mkdir -p ${package_dir}/final_dist
python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
15 changes: 14 additions & 1 deletion ci/test_wheel_cudf.sh
@@ -3,8 +3,21 @@

set -eou pipefail

# Set the manylinux version used for downloading the wheels so that we test the
# newer ABI wheels on the newer images that support their installation.
# Need to disable pipefail for the head not to fail, see
# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
set +o pipefail
glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
set -o pipefail
manylinux_version="2_17"
if [[ ${glibc_minor_version} -ge 28 ]]; then
manylinux_version="2_28"
fi
manylinux="manylinux_${manylinux_version}"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/cudf*.whl)[test]
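
As an illustrative aside (not part of the diff): the ldd-parsing above answers "which manylinux ABI can this host install?", the same question pip answers when choosing a wheel. A minimal C++ sketch of the equivalent runtime check, assuming a glibc-based Linux host and using the real glibc extension gnu_get_libc_version():

    #include <gnu/libc-version.h>  // glibc-only extension header

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
      // gnu_get_libc_version() returns the running glibc version, e.g. "2.28".
      std::string const version = gnu_get_libc_version();
      int const minor = std::atoi(version.substr(version.find('.') + 1).c_str());
      // Mirror the script: glibc >= 2.28 hosts can install manylinux_2_28 wheels.
      char const* tag = (minor >= 28) ? "manylinux_2_28" : "manylinux_2_17";
      std::printf("glibc %s -> %s\n", version.c_str(), tag);
      return 0;
    }
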
15 changes: 14 additions & 1 deletion ci/test_wheel_dask_cudf.sh
@@ -7,7 +7,20 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist

# Download the cudf built in the previous step
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
# Set the manylinux version used for downloading the wheels so that we test the
# newer ABI wheels on the newer images that support their installation.
# Need to disable pipefail for the head not to fail, see
# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
set +o pipefail
glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
set -o pipefail
manylinux_version="2_17"
if [[ ${glibc_minor_version} -ge 28 ]]; then
manylinux_version="2_28"
fi
manylinux="manylinux_${manylinux_version}"

RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
python -m pip install --no-deps ./local-cudf-dep/cudf*.whl

# Always install latest dask for testing
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,6 +15,7 @@ dependencies:
- c-compiler
- cachetools
- cmake>=3.26.4
- cramjam
- cubinlinker
- cuda-nvtx=11.8
- cuda-python>=11.7.1,<12.0a0
1 change: 1 addition & 0 deletions conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -15,6 +15,7 @@ dependencies:
- c-compiler
- cachetools
- cmake>=3.26.4
- cramjam
- cuda-cudart-dev
- cuda-gdb
- cuda-nvcc
62 changes: 34 additions & 28 deletions cpp/cmake/thirdparty/get_arrow.cmake
@@ -53,19 +53,35 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION)
find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
add_library(arrow_shared ALIAS Arrow::Arrow)

# When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's
# `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note
# that these flags will often be redundant because we build wheels in manylinux containers that
# actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent
# behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer
# containers. Note that tests will not build successfully without also propagating these options
# to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly
# NVBench) builds. We are currently ignoring these limitations since we don't anticipate using
# this feature except for building wheels.
target_compile_options(
Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
# When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11
# ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we
# need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow.
# We determine what options to use by checking the glibc version on the current system, which is
# also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will
# not build successfully without also propagating these options to builds of GTest. Similarly,
# benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently
# ignoring these limitations since we don't anticipate using this feature except for building
# wheels.
EXECUTE_PROCESS(
COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6
OUTPUT_VARIABLE GLIBC_EXECUTABLE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
EXECUTE_PROCESS(
COMMAND ${GLIBC_EXECUTABLE}
OUTPUT_VARIABLE GLIBC_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
STRING(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT})
STRING(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION})
STRING(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION})
LIST(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR)
if(GLIBC_VERSION_MINOR LESS 28)
target_compile_options(
Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
)
endif()

rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)
@@ -408,22 +424,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
endfunction()

if(NOT DEFINED CUDF_VERSION_Arrow)
# Temporarily use Arrow 12.0.1 in wheels and Arrow 13.0.0 otherwise
if(USE_LIBARROW_FROM_PYARROW)
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
12.0.1
CACHE STRING "The version of Arrow to find (or build)"
)
else()
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
13.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
13.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()

find_and_configure_arrow(
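
To make the ABI comment above concrete: _GLIBCXX_USE_CXX11_ABI is libstdc++'s macro for choosing between the old copy-on-write std::string ABI and the C++11-conforming one; pyarrow wheels built on manylinux2014 use the old one. A small probe (an illustrative aside, not part of the diff) shows which ABI a translation unit gets:

    #include <cstdio>
    #include <string>  // pulls in libstdc++'s ABI macro

    int main() {
    #if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI
      // Default on modern libstdc++; strings mangle as std::__cxx11::basic_string.
      std::puts("new C++11 ABI");
    #else
      // Selected by -D_GLIBCXX_USE_CXX11_ABI=0, the flag injected above.
      std::puts("old pre-C++11 (COW string) ABI");
    #endif
      return 0;
    }

Compiling with g++ -D_GLIBCXX_USE_CXX11_ABI=0 takes the second branch, which is exactly what the block above forces on Arrow consumers when the build host's glibc is older than 2.28.
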
1 change: 0 additions & 1 deletion cpp/include/cudf/io/detail/json.hpp
@@ -17,7 +17,6 @@
#pragma once

#include <cudf/io/json.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>

4 changes: 4 additions & 0 deletions cpp/include/cudf/io/json.hpp
@@ -512,13 +512,15 @@ class json_reader_options_builder {
* @endcode
*
* @param options Settings for controlling reading behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate device memory of the table in the returned
* table_with_metadata.
*
* @return The set of columns along with metadata
*/
table_with_metadata read_json(
json_reader_options options,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
@@ -861,9 +863,11 @@ class json_writer_options_builder {
* @endcode
*
* @param options Settings for controlling writing behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
void write_json(json_writer_options const& options,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
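
A hedged usage sketch of the new stream parameter (the input path is hypothetical; the builder calls are the existing cudf::io API):

    #include <cudf/io/json.hpp>
    #include <cudf/utilities/default_stream.hpp>

    // Read newline-delimited JSON, passing the stream argument explicitly
    // rather than relying on the default added in this change.
    auto opts = cudf::io::json_reader_options::builder(
                  cudf::io::source_info{"data.jsonl"})  // hypothetical file
                  .lines(true)
                  .build();
    auto result = cudf::io::read_json(opts, cudf::get_default_stream());
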
57 changes: 57 additions & 0 deletions cpp/include/cudf/io/parquet.hpp
@@ -532,6 +532,9 @@ class parquet_writer_options {
// Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
// If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false;
// Parquet writer can write timestamps as UTC
// Defaults to true because libcudf timestamps are implicitly UTC
bool _write_timestamps_as_UTC = true;
// Column chunks file paths to be set in the raw output metadata. One per output file
std::vector<std::string> _column_chunks_file_paths;
// Maximum size of each row group (unless smaller than a single page)
@@ -652,6 +655,13 @@
*/
bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }

/**
* @brief Returns `true` if timestamps will be written as UTC
*
* @return `true` if timestamps will be written as UTC
*/
[[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }

/**
* @brief Returns Column chunks file paths to be set in the raw output metadata.
*
@@ -789,6 +799,13 @@
*/
void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }

/**
* @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
*
* @param val Boolean value to enable/disable writing of timestamps as UTC.
*/
void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }

/**
* @brief Sets column chunks file path to be set in the raw output metadata.
*
@@ -1100,6 +1117,18 @@
return *this;
}

/**
* @brief Set to true if timestamps are to be written as UTC.
*
* @param enabled Boolean value to enable/disable writing of timestamps as UTC.
* @return this for chaining
*/
parquet_writer_options_builder& utc_timestamps(bool enabled)
{
options._write_timestamps_as_UTC = enabled;
return *this;
}

/**
* @brief Set to true if V2 page headers are to be written.
*
@@ -1171,6 +1200,8 @@ class chunked_parquet_writer_options {
// Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
// If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false;
// Parquet writer can write timestamps as UTC. Defaults to true.
bool _write_timestamps_as_UTC = true;
// Maximum size of each row group (unless smaller than a single page)
size_t _row_group_size_bytes = default_row_group_size_bytes;
// Maximum number of rows in row group (unless smaller than a single page)
@@ -1254,6 +1285,13 @@
*/
bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }

/**
* @brief Returns `true` if timestamps will be written as UTC
*
* @return `true` if timestamps will be written as UTC
*/
[[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }

/**
* @brief Returns maximum row group size, in bytes.
*
@@ -1375,6 +1413,13 @@
*/
void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }

/**
* @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
*
* @param val Boolean value to enable/disable writing of timestamps as UTC.
*/
void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }

/**
* @brief Sets the maximum row group size, in bytes.
*
@@ -1539,6 +1584,18 @@ class chunked_parquet_writer_options_builder {
return *this;
}

/**
* @brief Set to true if timestamps are to be written as UTC.
*
* @param enabled Boolean value to enable/disable writing of timestamps as UTC.
* @return this for chaining
*/
chunked_parquet_writer_options_builder& utc_timestamps(bool enabled)
{
options._write_timestamps_as_UTC = enabled;
return *this;
}

/**
* @brief Set to true if V2 page headers are to be written.
*
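
A short usage sketch of the new writer option (sink path and table variable are hypothetical; utc_timestamps() is the builder method added above):

    #include <cudf/io/parquet.hpp>

    // `tbl` is assumed to be a std::unique_ptr<cudf::table> built elsewhere.
    auto opts = cudf::io::parquet_writer_options::builder(
                  cudf::io::sink_info{"out.parquet"}, tbl->view())
                  .utc_timestamps(false)  // override the new default of true
                  .build();
    cudf::io::write_parquet(opts);
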
(Diff truncated; the remaining changed files are not shown.)
