Merge branch 'branch-23.12' into bug-merge-overflow
divyegala authored Oct 31, 2023
2 parents bec7feb + f4c95aa commit 829d9ea
Showing 30 changed files with 631 additions and 168 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pr.yaml
@@ -101,6 +101,7 @@ jobs:
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
with:
build_type: pull-request
build-2_28-wheels: "true"
script: "ci/build_wheel_cudf.sh"
wheel-tests-cudf:
needs: wheel-build-cudf
3 changes: 1 addition & 2 deletions ci/build_wheel_cudf.sh
@@ -9,8 +9,7 @@ export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF"

./ci/build_wheel.sh cudf ${package_dir}

mkdir -p ${package_dir}/final_dist
python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
15 changes: 14 additions & 1 deletion ci/test_wheel_cudf.sh
@@ -3,8 +3,21 @@

set -eou pipefail

# Set the manylinux version used for downloading the wheels so that we test the
# newer ABI wheels on the newer images that support their installation.
# Need to disable pipefail for the head not to fail, see
# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
set +o pipefail
glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
set -o pipefail
manylinux_version="2_17"
if [[ ${glibc_minor_version} -ge 28 ]]; then
manylinux_version="2_28"
fi
manylinux="manylinux_${manylinux_version}"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install $(echo ./dist/cudf*.whl)[test]
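
As an illustrative aside (not part of the diff): the ldd-parsing above answers "which manylinux ABI can this host install?", the same question pip answers when choosing a wheel. A minimal C++ sketch of the equivalent runtime check, assuming a glibc-based Linux host and using the real glibc extension gnu_get_libc_version():

    #include <gnu/libc-version.h>  // glibc-only extension header

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
      // gnu_get_libc_version() returns the running glibc version, e.g. "2.28".
      std::string const version = gnu_get_libc_version();
      int const minor = std::atoi(version.substr(version.find('.') + 1).c_str());
      // Mirror the script: glibc >= 2.28 hosts can install manylinux_2_28 wheels.
      char const* tag = (minor >= 28) ? "manylinux_2_28" : "manylinux_2_17";
      std::printf("glibc %s -> %s\n", version.c_str(), tag);
      return 0;
    }
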
15 changes: 14 additions & 1 deletion ci/test_wheel_dask_cudf.sh
@@ -7,7 +7,20 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist

# Download the cudf built in the previous step
RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
# Set the manylinux version used for downloading the wheels so that we test the
# newer ABI wheels on the newer images that support their installation.
# Need to disable pipefail for the head not to fail, see
# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
set +o pipefail
glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
set -o pipefail
manylinux_version="2_17"
if [[ ${glibc_minor_version} -ge 28 ]]; then
manylinux_version="2_28"
fi
manylinux="manylinux_${manylinux_version}"

RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
python -m pip install --no-deps ./local-cudf-dep/cudf*.whl

# Always install latest dask for testing
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,6 +15,7 @@ dependencies:
- c-compiler
- cachetools
- cmake>=3.26.4
- cramjam
- cubinlinker
- cuda-nvtx=11.8
- cuda-python>=11.7.1,<12.0a0
1 change: 1 addition & 0 deletions conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -15,6 +15,7 @@ dependencies:
- c-compiler
- cachetools
- cmake>=3.26.4
- cramjam
- cuda-cudart-dev
- cuda-gdb
- cuda-nvcc
62 changes: 34 additions & 28 deletions cpp/cmake/thirdparty/get_arrow.cmake
@@ -53,19 +53,35 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION)
find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
add_library(arrow_shared ALIAS Arrow::Arrow)

# When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's
# `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note
# that these flags will often be redundant because we build wheels in manylinux containers that
# actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent
# behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer
# containers. Note that tests will not build successfully without also propagating these options
# to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly
# NVBench) builds. We are currently ignoring these limitations since we don't anticipate using
# this feature except for building wheels.
target_compile_options(
Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
# When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11
# ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we
# need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow.
# We determine what options to use by checking the glibc version on the current system, which is
# also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will
# not build successfully without also propagating these options to builds of GTest. Similarly,
# benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently
# ignoring these limitations since we don't anticipate using this feature except for building
# wheels.
EXECUTE_PROCESS(
COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6
OUTPUT_VARIABLE GLIBC_EXECUTABLE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
EXECUTE_PROCESS(
COMMAND ${GLIBC_EXECUTABLE}
OUTPUT_VARIABLE GLIBC_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
STRING(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT})
STRING(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION})
STRING(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION})
LIST(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR)
if(GLIBC_VERSION_MINOR LESS 28)
target_compile_options(
Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
)
endif()

rapids_export_package(BUILD Arrow cudf-exports)
rapids_export_package(INSTALL Arrow cudf-exports)
@@ -408,22 +424,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
endfunction()

if(NOT DEFINED CUDF_VERSION_Arrow)
# Temporarily use Arrow 12.0.1 in wheels and Arrow 13.0.0 otherwise
if(USE_LIBARROW_FROM_PYARROW)
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
12.0.1
CACHE STRING "The version of Arrow to find (or build)"
)
else()
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
13.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
13.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()

find_and_configure_arrow(
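
To make the ABI comment above concrete: _GLIBCXX_USE_CXX11_ABI is libstdc++'s macro for choosing between the old copy-on-write std::string ABI and the C++11-conforming one; pyarrow wheels built on manylinux2014 use the old one. A small probe (an illustrative aside, not part of the diff) shows which ABI a translation unit gets:

    #include <cstdio>
    #include <string>  // pulls in libstdc++'s ABI macro

    int main() {
    #if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI
      // Default on modern libstdc++; strings mangle as std::__cxx11::basic_string.
      std::puts("new C++11 ABI");
    #else
      // Selected by -D_GLIBCXX_USE_CXX11_ABI=0, the flag injected above.
      std::puts("old pre-C++11 (COW string) ABI");
    #endif
      return 0;
    }

Compiling with g++ -D_GLIBCXX_USE_CXX11_ABI=0 takes the second branch, which is exactly what the block above forces on Arrow consumers when the build host's glibc is older than 2.28.
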
1 change: 0 additions & 1 deletion cpp/include/cudf/io/detail/json.hpp
@@ -17,7 +17,6 @@
#pragma once

#include <cudf/io/json.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>

4 changes: 4 additions & 0 deletions cpp/include/cudf/io/json.hpp
@@ -512,13 +512,15 @@ class json_reader_options_builder {
* @endcode
*
* @param options Settings for controlling reading behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate device memory of the table in the returned
* table_with_metadata.
*
* @return The set of columns along with metadata
*/
table_with_metadata read_json(
json_reader_options options,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
@@ -861,9 +863,11 @@ class json_writer_options_builder {
* @endcode
*
* @param options Settings for controlling writing behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
void write_json(json_writer_options const& options,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
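
A hedged usage sketch of the new stream parameter (the input path is hypothetical; the builder calls are the existing cudf::io API):

    #include <cudf/io/json.hpp>
    #include <cudf/utilities/default_stream.hpp>

    // Read newline-delimited JSON, passing the stream argument explicitly
    // rather than relying on the default added in this change.
    auto opts = cudf::io::json_reader_options::builder(
                  cudf::io::source_info{"data.jsonl"})  // hypothetical file
                  .lines(true)
                  .build();
    auto result = cudf::io::read_json(opts, cudf::get_default_stream());
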
57 changes: 57 additions & 0 deletions cpp/include/cudf/io/parquet.hpp
@@ -532,6 +532,9 @@ class parquet_writer_options {
// Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
// If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false;
// Parquet writer can write timestamps as UTC
// Defaults to true because libcudf timestamps are implicitly UTC
bool _write_timestamps_as_UTC = true;
// Column chunks file paths to be set in the raw output metadata. One per output file
std::vector<std::string> _column_chunks_file_paths;
// Maximum size of each row group (unless smaller than a single page)
@@ -652,6 +655,13 @@
*/
bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }

/**
* @brief Returns `true` if timestamps will be written as UTC
*
* @return `true` if timestamps will be written as UTC
*/
[[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }

/**
* @brief Returns Column chunks file paths to be set in the raw output metadata.
*
@@ -789,6 +799,13 @@
*/
void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }

/**
* @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
*
* @param val Boolean value to enable/disable writing of timestamps as UTC.
*/
void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }

/**
* @brief Sets column chunks file path to be set in the raw output metadata.
*
@@ -1100,6 +1117,18 @@
return *this;
}

/**
* @brief Set to true if timestamps are to be written as UTC.
*
* @param enabled Boolean value to enable/disable writing of timestamps as UTC.
* @return this for chaining
*/
parquet_writer_options_builder& utc_timestamps(bool enabled)
{
options._write_timestamps_as_UTC = enabled;
return *this;
}

/**
* @brief Set to true if V2 page headers are to be written.
*
@@ -1171,6 +1200,8 @@ class chunked_parquet_writer_options {
// Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
// If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false;
// Parquet writer can write timestamps as UTC. Defaults to true.
bool _write_timestamps_as_UTC = true;
// Maximum size of each row group (unless smaller than a single page)
size_t _row_group_size_bytes = default_row_group_size_bytes;
// Maximum number of rows in row group (unless smaller than a single page)
@@ -1254,6 +1285,13 @@
*/
bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }

/**
* @brief Returns `true` if timestamps will be written as UTC
*
* @return `true` if timestamps will be written as UTC
*/
[[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }

/**
* @brief Returns maximum row group size, in bytes.
*
@@ -1375,6 +1413,13 @@
*/
void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }

/**
* @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
*
* @param val Boolean value to enable/disable writing of timestamps as UTC.
*/
void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }

/**
* @brief Sets the maximum row group size, in bytes.
*
@@ -1539,6 +1584,18 @@ class chunked_parquet_writer_options_builder {
return *this;
}

/**
* @brief Set to true if timestamps are to be written as UTC.
*
* @param enabled Boolean value to enable/disable writing of timestamps as UTC.
* @return this for chaining
*/
chunked_parquet_writer_options_builder& utc_timestamps(bool enabled)
{
options._write_timestamps_as_UTC = enabled;
return *this;
}

/**
* @brief Set to true if V2 page headers are to be written.
*
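
A short usage sketch of the new writer option (sink path and table variable are hypothetical; utc_timestamps() is the builder method added above):

    #include <cudf/io/parquet.hpp>

    // `tbl` is assumed to be a std::unique_ptr<cudf::table> built elsewhere.
    auto opts = cudf::io::parquet_writer_options::builder(
                  cudf::io::sink_info{"out.parquet"}, tbl->view())
                  .utc_timestamps(false)  // override the new default of true
                  .build();
    cudf::io::write_parquet(opts);
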
(Diff truncated; the remaining changed files are not shown.)
