Use nvcomp's snappy compressor in parquet writer #8229

Merged

Commits (21)
db23741
Initial changes to get nvcomp integrated
devavret May 7, 2021
a5f3363
Using nvcomp provided max compressed buffer size
devavret May 12, 2021
61018aa
Recover from error in nvcomp compressing and encode uncompressed.
devavret May 12, 2021
64d7d1c
review changes
devavret May 13, 2021
27764e7
Replace accidental vector with uvector.
devavret May 14, 2021
95a57ec
Provide the actual max uncomp page size to nvcomp's temp size estimat…
devavret May 14, 2021
cc9500a
cmake changes requested in review
devavret May 14, 2021
7989b9c
Merge branch 'branch-21.10' into parquet-writer-nvcomp-snappy
devavret Aug 19, 2021
f90409c
Merge branch 'branch-21.10' into parquet-writer-nvcomp-snappy
devavret Aug 19, 2021
40ebd1e
Update parquet writer to use nvcomp 2.1
devavret Aug 24, 2021
4a2cb24
One more cmake change related to updating nvcomp
devavret Aug 24, 2021
6019b0f
Update nvcomp to version with fix for snappy decompressor
devavret Aug 31, 2021
140d3d0
Fix allocation size bug
devavret Sep 2, 2021
05f5343
Merge branch 'branch-21.10' into parquet-writer-nvcomp-snappy
devavret Sep 3, 2021
62d92b4
Update cmake to find nvcomp in new manner
devavret Sep 3, 2021
3c73be3
Make nvcomp private in cmake and update get_nvcomp
devavret Sep 7, 2021
e0a013d
Add an env var flip switch to choose b/w nvcomp and inbuilt compressor
devavret Sep 8, 2021
7501b11
Merge branch 'branch-21.10' into parquet-writer-nvcomp-snappy
devavret Sep 8, 2021
bfa1366
Static linking nvcomp into libcudf
devavret Sep 8, 2021
203cf15
Review changes
devavret Sep 9, 2021
6721fb8
Merge changes from nvcomp -fPIC
devavret Sep 13, 2021
21 changes: 21 additions & 0 deletions cpp/CMakeLists.txt
@@ -125,6 +125,8 @@ include(cmake/thirdparty/CUDF_GetCPM.cmake)
include(cmake/thirdparty/CUDF_FindBoost.cmake)
# find jitify
include(cmake/thirdparty/CUDF_GetJitify.cmake)
# find nvCOMP
include(cmake/thirdparty/CUDF_GetnvCOMP.cmake)
# find thrust/cub
include(cmake/thirdparty/CUDF_GetThrust.cmake)
# find rmm
@@ -475,6 +477,7 @@ target_link_libraries(cudf
Boost::filesystem
${ARROW_LIBRARIES}
cudf::Thrust
nvCOMP::nvcomp
rmm::rmm)

if(CUDA_STATIC_RUNTIME)
@@ -580,6 +583,10 @@ install(TARGETS cudf
DESTINATION lib
EXPORT cudf-targets)

install(TARGETS nvcomp
DESTINATION lib
EXPORT cudf-nvcomp-target)

install(DIRECTORY
${CUDF_SOURCE_DIR}/include/cudf
${CUDF_SOURCE_DIR}/include/cudf_test
@@ -623,6 +630,11 @@ install(EXPORT cudf-testing-targets
NAMESPACE cudf::
DESTINATION "${INSTALL_CONFIGDIR}")

install(EXPORT cudf-nvcomp-target
FILE cudf-nvcomp-target.cmake
NAMESPACE nvCOMP::
DESTINATION "${INSTALL_CONFIGDIR}")

################################################################################################
# - build export -------------------------------------------------------------------------------
configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR}/cudf-config.cmake
@@ -656,6 +668,15 @@ if(TARGET gtest)
endif()
endif()

if(TARGET nvcomp)
get_target_property(nvcomp_is_imported nvcomp IMPORTED)
if(NOT nvcomp_is_imported)
export(TARGETS nvcomp
FILE ${CUDF_BINARY_DIR}/cudf-nvcomp-target.cmake
NAMESPACE nvCOMP::)
endif()
endif()

export(EXPORT cudf-targets
FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake
NAMESPACE cudf::)
7 changes: 7 additions & 0 deletions cpp/cmake/cudf-build-config.cmake.in
@@ -71,6 +71,13 @@ else()
include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake)
endif()

# find nvCOMP
if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-nvcomp-target.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/cudf-nvcomp-target.cmake")
else()
include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetnvCOMP.cmake)
endif()

list(POP_FRONT CMAKE_MODULE_PATH)


3 changes: 3 additions & 0 deletions cpp/cmake/cudf-config.cmake.in
@@ -83,6 +83,9 @@ find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@)

find_dependency(rmm @CUDF_MIN_VERSION_rmm@)

find_dependency(nvCOMP @CUDF_MIN_VERSION_nvCOMP@)
include("${CMAKE_CURRENT_LIST_DIR}/cudf-nvcomp-target.cmake")

set(Thrust_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust")
find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@)
thrust_create_target(cudf::Thrust FROM_OPTIONS)
41 changes: 41 additions & 0 deletions cpp/cmake/thirdparty/CUDF_GetnvCOMP.cmake
@@ -0,0 +1,41 @@
#=============================================================================
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================

function(find_and_configure_nvcomp VERSION)

if(TARGET nvCOMP::nvcomp)
return()
endif()

# Find or install nvCOMP
CPMFindPackage(NAME nvCOMP
VERSION ${VERSION}
GIT_REPOSITORY https://github.com/NVIDIA/nvcomp.git
GIT_TAG v${VERSION}
GIT_SHALLOW TRUE
OPTIONS )

if(NOT TARGET nvCOMP::nvcomp)
add_library(nvCOMP::nvcomp ALIAS nvcomp)
endif()

# Make sure consumers of cudf can also see nvCOMP::nvcomp target
fix_cmake_global_defaults(nvCOMP::nvcomp)
endfunction()

set(CUDF_MIN_VERSION_nvCOMP 2.0.0)

find_and_configure_nvcomp(${CUDF_MIN_VERSION_nvCOMP})
2 changes: 1 addition & 1 deletion cpp/src/io/comp/snap.cu
@@ -257,7 +257,7 @@ static __device__ uint32_t Match60(const uint8_t *src1,
* @param[out] outputs Compression status per block
* @param[in] count Number of blocks to compress
*/
extern "C" __global__ void __launch_bounds__(128)
__global__ void __launch_bounds__(128)
snap_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count)
{
__shared__ __align__(16) snap_state_s state_g;
17 changes: 14 additions & 3 deletions cpp/src/io/parquet/page_enc.cu
@@ -410,6 +410,7 @@ __global__ void __launch_bounds__(128)
device_span<parquet_column_device_view const> col_desc,
statistics_merge_group *page_grstats,
statistics_merge_group *chunk_grstats,
size_t max_page_comp_data_size,
int32_t num_columns)
{
// TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach
@@ -439,6 +440,8 @@ __global__ void __launch_bounds__(128)
uint32_t page_offset = ck_g.ck_stat_size;
uint32_t num_dict_entries = 0;
uint32_t comp_page_offset = ck_g.ck_stat_size;
uint32_t page_headers_size = 0;
uint32_t max_page_data_size = 0;
uint32_t cur_row = ck_g.start_row;
uint32_t ck_max_stats_len = 0;
uint32_t max_stats_len = 0;
@@ -465,7 +468,9 @@
page_g.num_leaf_values = ck_g.total_dict_entries;
page_g.num_values = ck_g.total_dict_entries;
page_offset += page_g.max_hdr_size + page_g.max_data_size;
comp_page_offset += page_g.max_hdr_size + GetMaxCompressedBfrSize(page_g.max_data_size);
comp_page_offset += page_g.max_hdr_size + max_page_comp_data_size;
page_headers_size += page_g.max_hdr_size;
max_page_data_size = max(max_page_data_size, page_g.max_data_size);
}
__syncwarp();
if (t == 0) {
@@ -571,7 +576,9 @@ __global__ void __launch_bounds__(128)
pagestats_g.start_chunk = ck_g.first_fragment + page_start;
pagestats_g.num_chunks = page_g.num_fragments;
page_offset += page_g.max_hdr_size + page_g.max_data_size;
comp_page_offset += page_g.max_hdr_size + GetMaxCompressedBfrSize(page_g.max_data_size);
comp_page_offset += page_g.max_hdr_size + max_page_comp_data_size;
page_headers_size += page_g.max_hdr_size;
max_page_data_size = max(max_page_data_size, page_g.max_data_size);
cur_row += rows_in_page;
ck_max_stats_len = max(ck_max_stats_len, max_stats_len);
}
@@ -610,6 +617,8 @@ __global__ void __launch_bounds__(128)
ck_g.num_pages = num_pages;
ck_g.bfr_size = page_offset;
ck_g.compressed_size = comp_page_offset;
ck_g.page_headers_size = page_headers_size;
ck_g.max_page_data_size = max_page_data_size;
pagestats_g.start_chunk = ck_g.first_page + ck_g.has_dictionary; // Exclude dictionary
pagestats_g.num_chunks = num_pages - ck_g.has_dictionary;
}
@@ -2141,6 +2150,7 @@ void InitFragmentStatistics(device_2dspan<statistics_group> groups,
* @param[in] num_columns Number of columns
* @param[out] page_grstats Setup for page-level stats
* @param[out] chunk_grstats Setup for chunk-level stats
* @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages
* @param[in] stream CUDA stream to use, default 0
*/
void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
@@ -2149,12 +2159,13 @@ void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
int32_t num_columns,
statistics_merge_group *page_grstats,
statistics_merge_group *chunk_grstats,
size_t max_page_comp_data_size,
rmm::cuda_stream_view stream)
{
auto num_rowgroups = chunks.size().first;
dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup
gpuInitPages<<<dim_grid, 128, 0, stream.value()>>>(
chunks, pages, col_desc, page_grstats, chunk_grstats, num_columns);
chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns);
}

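A note on the new max_page_comp_data_size parameter: the per-page GetMaxCompressedBfrSize() device helper is replaced by a single worst-case bound computed on the host and passed into InitEncoderPages. A minimal sketch of how a caller might obtain that bound, assuming nvcomp 2.1's batched Snappy C API (the actual call site in the writer is not part of this hunk, and the helper name below is hypothetical):

#include <nvcomp/snappy.h>

#include <cstddef>

// Hypothetical helper: ask nvcomp for the worst-case compressed size of a
// page whose uncompressed payload is at most `max_page_uncomp_data_size`.
size_t query_max_page_comp_data_size(size_t max_page_uncomp_data_size)
{
  size_t max_page_comp_data_size = 0;
  nvcompBatchedSnappyOpts_t opts{};  // default format options
  auto status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize(
    max_page_uncomp_data_size, opts, &max_page_comp_data_size);
  // On failure, fall back to the uncompressed size so the writer can still
  // reserve enough space to emit the page uncompressed.
  return (status == nvcompSuccess) ? max_page_comp_data_size
                                   : max_page_uncomp_data_size;
}

// The result is then forwarded to page setup, matching the signature added
// in this diff:
//   InitEncoderPages(chunks, pages, col_desc, num_columns,
//                    page_grstats, chunk_grstats,
//                    max_page_comp_data_size, stream);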
8 changes: 6 additions & 2 deletions cpp/src/io/parquet/parquet_gpu.hpp
@@ -298,8 +298,10 @@ struct EncColumnChunk {
statistics_chunk const *stats; //!< Fragment statistics
uint32_t bfr_size; //!< Uncompressed buffer size
uint32_t compressed_size; //!< Compressed buffer size
uint32_t start_row; //!< First row of chunk
uint32_t num_rows; //!< Number of rows in chunk
uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk
uint32_t page_headers_size; //!< Sum of size of all page headers
uint32_t start_row; //!< First row of chunk
uint32_t num_rows; //!< Number of rows in chunk
uint32_t num_values; //!< Number of values in chunk. Different from num_rows for nested types
uint32_t first_fragment; //!< First fragment of chunk
EncPage *pages; //!< Ptr to pages that belong to this chunk
@@ -480,6 +482,7 @@ void InitFragmentStatistics(cudf::detail::device_2dspan<statistics_group> groups
* @param[in] num_columns Number of columns
* @param[in] page_grstats Setup for page-level stats
* @param[in] chunk_grstats Setup for chunk-level stats
* @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages
* @param[in] stream CUDA stream to use, default 0
*/
void InitEncoderPages(cudf::detail::device_2dspan<EncColumnChunk> chunks,
@@ -488,6 +491,7 @@ void InitEncoderPages(cudf::detail::device_2dspan<EncColumnChunk> chunks,
int32_t num_columns,
statistics_merge_group *page_grstats = nullptr,
statistics_merge_group *chunk_grstats = nullptr,
size_t max_page_comp_data_size = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);

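For reference, a host-side sketch of the bookkeeping behind the new EncColumnChunk fields (page_headers_size, max_page_data_size) and the worst-case compressed_size, mirroring what gpuInitPages accumulates per chunk in page_enc.cu above; the struct and function names below are illustrative only, not part of this change:

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative per-page worst-case sizes (the real values live in EncPage on
// the device).
struct PageSizes {
  uint32_t max_hdr_size;   // worst-case page header size
  uint32_t max_data_size;  // worst-case uncompressed page payload
};

struct ChunkSizes {
  uint32_t bfr_size;            // uncompressed buffer size
  uint32_t compressed_size;     // worst-case compressed buffer size
  uint32_t page_headers_size;   // sum of all page header sizes
  uint32_t max_page_data_size;  // largest page payload in the chunk
};

ChunkSizes compute_chunk_sizes(std::vector<PageSizes> const& pages,
                               uint32_t ck_stat_size,
                               uint32_t max_page_comp_data_size)
{
  ChunkSizes ck{ck_stat_size, ck_stat_size, 0, 0};
  for (auto const& pg : pages) {
    ck.bfr_size += pg.max_hdr_size + pg.max_data_size;
    // Every page reserves the same nvcomp-provided worst-case compressed size
    // instead of a per-page GetMaxCompressedBfrSize() estimate.
    ck.compressed_size += pg.max_hdr_size + max_page_comp_data_size;
    ck.page_headers_size += pg.max_hdr_size;
    ck.max_page_data_size = std::max(ck.max_page_data_size, pg.max_data_size);
  }
  return ck;
}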