Skip to content

Commit

Permalink
Merge branch 'branch-22.06' of https://github.com/rapidsai/cudf into …
Browse files Browse the repository at this point in the history
…bug-pq-metadata-array
  • Loading branch information
vuule committed May 6, 2022
2 parents ec806eb + 4913a9b commit 4a4bda7
Show file tree
Hide file tree
Showing 102 changed files with 3,766 additions and 2,229 deletions.
19 changes: 17 additions & 2 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,28 +112,39 @@ function buildLibCudfJniInDocker {
local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
local workspaceRepoDir="$workspaceDir/cudf"
local workspaceMavenRepoDir="$workspaceDir/.m2/repository"
local workspaceCcacheDir="$workspaceDir/.ccache"
mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
mkdir -p "$HOME/.ccache" "$HOME/.m2"
nvidia-docker build \
-f java/ci/Dockerfile.centos7 \
--build-arg CUDA_VERSION=${cudaVersion} \
-t $imageName .
nvidia-docker run -it -u $(id -u):$(id -g) --rm \
-e PARALLEL_LEVEL \
-e CCACHE_DISABLE \
-e CCACHE_DIR="$workspaceCcacheDir" \
-v "/etc/group:/etc/group:ro" \
-v "/etc/passwd:/etc/passwd:ro" \
-v "/etc/shadow:/etc/shadow:ro" \
-v "/etc/sudoers.d:/etc/sudoers.d:ro" \
-v "$HOME/.ccache:$workspaceCcacheDir:rw" \
-v "$REPODIR:$workspaceRepoDir:rw" \
-v "$localMavenRepo:$workspaceMavenRepoDir:rw" \
--workdir "$workspaceRepoDir/java/target/libcudf-cmake-build" \
${imageName} \
scl enable devtoolset-9 \
"cmake $workspaceRepoDir/cpp \
-G${CMAKE_GENERATOR} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCUDA_STATIC_RUNTIME=ON \
-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \
-DCMAKE_INSTALL_PREFIX==/usr/local/rapids \
-DUSE_NVTX=ON -DCUDF_USE_ARROW_STATIC=ON \
-DCMAKE_INSTALL_PREFIX=/usr/local/rapids \
-DUSE_NVTX=ON \
-DCUDF_USE_ARROW_STATIC=ON \
-DCUDF_ENABLE_ARROW_S3=OFF \
-DBUILD_TESTS=OFF \
-DPER_THREAD_DEFAULT_STREAM=ON \
Expand All @@ -145,6 +156,10 @@ function buildLibCudfJniInDocker {
-Dmaven.repo.local=$workspaceMavenRepoDir \
-DskipTests=${SKIP_TESTS:-false} \
-Dparallel.level=${PARALLEL_LEVEL} \
-Dcmake.ccache.opts='-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache' \
-DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \
-DCUDA_STATIC_RUNTIME=ON \
-DPER_THREAD_DEFAULT_STREAM=ON \
Expand Down
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,6 @@ add_library(
src/ast/expression_parser.cpp
src/ast/expressions.cpp
src/binaryop/binaryop.cpp
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/Add.cu
src/binaryop/compiled/ATan2.cu
src/binaryop/compiled/BitwiseAnd.cu
Expand Down Expand Up @@ -220,6 +219,7 @@ add_library(
src/binaryop/compiled/ShiftRightUnsigned.cu
src/binaryop/compiled/Sub.cu
src/binaryop/compiled/TrueDiv.cu
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/util.cpp
src/labeling/label_bins.cu
src/bitmask/null_mask.cu
Expand Down Expand Up @@ -362,6 +362,7 @@ add_library(
src/join/mixed_join_size_kernel_nulls.cu
src/join/mixed_join_size_kernels_semi.cu
src/join/semi_join.cu
src/lists/apply_boolean_mask.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_rows.cu
Expand Down
4 changes: 4 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,10 @@ ConfigureBench(PARQUET_WRITER_BENCH io/parquet/parquet_writer.cpp)
# * orc writer benchmark --------------------------------------------------------------------------
ConfigureBench(ORC_WRITER_BENCH io/orc/orc_writer.cpp)

# ##################################################################################################
# * orc writer chunks benchmark ---------------------------------------------------------------
ConfigureNVBench(ORC_WRITER_CHUNKS_NVBENCH io/orc/orc_writer_chunks.cpp)

# ##################################################################################################
# * csv writer benchmark --------------------------------------------------------------------------
ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer.cpp)
Expand Down
139 changes: 139 additions & 0 deletions cpp/benchmarks/io/orc/orc_writer_chunks.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <nvbench/nvbench.cuh>

#include <cudf/column/column.hpp>
#include <cudf/io/orc.hpp>
#include <cudf/table/table.hpp>

#include <algorithm>
#include <cstdint>
#include <memory>
#include <numeric>
#include <vector>

// to enable, run cmake with -DBUILD_BENCHMARKS=ON

// Total amount of random table data generated for each benchmark run (512 MiB).
constexpr int64_t data_size = 512 << 20;

// Shorthand for the cuDF I/O namespace used throughout this file.
namespace cudf_io = cudf::io;

// Benchmarks a single cudf::io::write_orc() call over one randomly generated table.
void nvbench_orc_write(nvbench::state& state)
{
  cudf::size_type const n_columns = state.get_int64("num_columns");

  // Random input cycling through a representative mix of cudf type groups.
  auto const dtypes = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
                                         int32_t(type_group_id::FLOATING_POINT),
                                         int32_t(type_group_id::FIXED_POINT),
                                         int32_t(type_group_id::TIMESTAMP),
                                         int32_t(cudf::type_id::STRING),
                                         int32_t(cudf::type_id::STRUCT),
                                         int32_t(cudf::type_id::LIST)});
  auto const input_table =
    create_random_table(cycle_dtypes(dtypes, n_columns), table_size_bytes{data_size});
  cudf::table_view const input_view = input_table->view();

  auto mem_stats_logger = cudf::memory_stats_logger();

  state.add_global_memory_reads<int64_t>(data_size);
  state.add_element_count(input_view.num_columns() * input_view.num_rows());

  size_t encoded_file_size = 0;

  state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch, auto& timer) {
               // Sink construction stays outside the timed region.
               cuio_source_sink_pair source_sink(io_type::VOID);
               timer.start();

               cudf_io::orc_writer_options opts =
                 cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), input_view);
               cudf_io::write_orc(opts);

               timer.stop();
               encoded_file_size = source_sink.size();
             });

  state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
  state.add_buffer_size(encoded_file_size, "efs", "Encoded File Size");
  state.add_buffer_size(input_view.num_rows(), "trc", "Total Rows");
}

// Benchmarks the chunked ORC writer: the same total data volume is split across
// `num_chunks` tables and written through a single orc_chunked_writer.
void nvbench_orc_chunked_write(nvbench::state& state)
{
  cudf::size_type num_cols   = state.get_int64("num_columns");
  cudf::size_type num_tables = state.get_int64("num_chunks");

  // Generate `num_tables` random tables that together hold ~data_size bytes.
  std::vector<std::unique_ptr<cudf::table>> tables;
  for (cudf::size_type idx = 0; idx < num_tables; idx++) {
    tables.push_back(
      create_random_table(cycle_dtypes(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
                                                          int32_t(type_group_id::FLOATING_POINT),
                                                          int32_t(type_group_id::FIXED_POINT),
                                                          int32_t(type_group_id::TIMESTAMP),
                                                          int32_t(cudf::type_id::STRING),
                                                          int32_t(cudf::type_id::STRUCT),
                                                          int32_t(cudf::type_id::LIST)}),
                                       num_cols),
                          table_size_bytes{size_t(data_size / num_tables)}));
  }

  auto mem_stats_logger = cudf::memory_stats_logger();

  // Accumulate in 64-bit: seeding std::accumulate with the int literal `0`
  // makes the accumulator `int`, which can overflow once the element count
  // approaches 2^31. This also replaces thrust::make_transform_iterator,
  // which depended on a thrust header this file never includes.
  auto const total_elements =
    std::accumulate(tables.cbegin(), tables.cend(), int64_t{0}, [](int64_t sum, auto const& tbl) {
      return sum + int64_t{tbl->num_columns()} * tbl->num_rows();
    });
  auto const total_rows =
    std::accumulate(tables.cbegin(), tables.cend(), int64_t{0}, [](int64_t sum, auto const& tbl) {
      return sum + tbl->num_rows();
    });

  state.add_global_memory_reads<int64_t>(data_size);
  state.add_element_count(total_elements);

  size_t encoded_file_size = 0;

  state.exec(
    nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
      // Sink construction stays outside the timed region.
      cuio_source_sink_pair source_sink(io_type::VOID);
      timer.start();

      cudf_io::chunked_orc_writer_options opts =
        cudf_io::chunked_orc_writer_options::builder(source_sink.make_sink_info());
      cudf_io::orc_chunked_writer writer(opts);
      // Write each chunk through the same writer; close() finalizes the file footer.
      std::for_each(tables.begin(),
                    tables.end(),
                    [&writer](std::unique_ptr<cudf::table> const& tbl) { writer.write(*tbl); });
      writer.close();

      timer.stop();
      encoded_file_size = source_sink.size();
    });

  state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
  state.add_buffer_size(encoded_file_size, "efs", "Encoded File Size");
  state.add_buffer_size(total_rows, "trc", "Total Rows");
}

// Register the single-shot ORC writer benchmark, sweeping the column count.
NVBENCH_BENCH(nvbench_orc_write)
.set_name("orc_write")
.set_min_samples(4)
.add_int64_axis("num_columns", {8, 64});

// Register the chunked ORC writer benchmark, sweeping both the column count
// and the number of chunks the input data is split into.
NVBENCH_BENCH(nvbench_orc_chunked_write)
.set_name("orc_chunked_write")
.set_min_samples(4)
.add_int64_axis("num_columns", {8, 64})
.add_int64_axis("num_chunks", {8, 64});
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_cucollections.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ function(find_and_configure_cucollections)
GLOBAL_TARGETS cuco::cuco
BUILD_EXPORT_SET cudf-exports
CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections
GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5
GIT_TAG 8b15f06f38d034e815bc72045ca3403787f75e07
EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS}
OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
)
Expand Down
16 changes: 15 additions & 1 deletion cpp/include/cudf/detail/structs/utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -245,6 +245,20 @@ std::tuple<cudf::table_view, std::vector<rmm::device_buffer>> superimpose_parent
table_view const& table,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Checks if a column or any of its children is a struct column with structs that are null.
 *
 * This function searches for structs that are null -- differentiating between structs that are null
 * and structs containing null values. Null structs add a column to the result of the flatten column
 * utility and necessitate column_nullability::FORCE when flattening the column for comparison
 * operations.
 *
 * @param col Column to check for null structs
 * @return A boolean indicating if the column is or contains a struct column that contains a null
 * struct.
 */
bool contains_null_structs(column_view const& col);
} // namespace detail
} // namespace structs
} // namespace cudf
37 changes: 37 additions & 0 deletions cpp/include/cudf/lists/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf::lists::detail {

/**
 * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&,
 * rmm::mr::device_memory_resource*)
 *
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @return List column of the same type as `input`, containing filtered list rows
 */
std::unique_ptr<column> apply_boolean_mask(
lists_column_view const& input,
lists_column_view const& boolean_mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists::detail
58 changes: 58 additions & 0 deletions cpp/include/cudf/lists/stream_compaction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf::lists {

/**
 * @brief Filters elements in each row of `input` LIST column using `boolean_mask`
 * LIST of booleans as a mask.
 *
 * Given an input `LIST` column and a list-of-bools column, the function produces
 * a new `LIST` column of the same type as `input`, where each element is copied
 * from the input row *only* if the corresponding `boolean_mask` element is non-null and `true`.
 *
 * E.g.
 * @code{.pseudo}
 * input        = { {0,1,2}, {3,4}, {5,6,7}, {8,9} };
 * boolean_mask = { {0,1,1}, {1,0}, {1,1,1}, {0,0} };
 * results      = { {1,2},   {3},   {5,6,7}, {} };
 * @endcode
 *
 * `input` and `boolean_mask` must have the same number of rows.
 * The output column has the same number of rows as the input column.
 * An element is copied to an output row *only* if the corresponding boolean_mask element is `true`.
 * An output row is invalid only if the input row is invalid.
 *
 * @throws cudf::logic_error if `boolean_mask` is not a "lists of bools" column
 * @throws cudf::logic_error if `input` and `boolean_mask` have different number of rows
 *
 * @param input The input list column view to be filtered
 * @param boolean_mask A nullable list of bools column used to filter `input` elements
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return List column of the same type as `input`, containing filtered list rows
 */
std::unique_ptr<column> apply_boolean_mask(
lists_column_view const& input,
lists_column_view const& boolean_mask,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists
2 changes: 1 addition & 1 deletion cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs)
* @brief A specialization for floating-point `Element` type relational comparison
* to derive the order of the elements with respect to `lhs`.
*
* This Specialization handles `nan` in the following order:
* This specialization handles `nan` in the following order:
* `[-Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN, null] (for null_order::AFTER)`
* `[null, -Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN] (for null_order::BEFORE)`
*
Expand Down
Loading

0 comments on commit 4a4bda7

Please sign in to comment.