Merge remote-tracking branch 'upstream/branch-22.06' into binaryop-compiled-util-refactor
bdice committed May 5, 2022
2 parents 1b9656c + 4ce7b65 commit ecfadd1
Showing 140 changed files with 5,194 additions and 1,883 deletions.
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -1,3 +1,5 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

repos:
- repo: https://github.com/PyCQA/isort
rev: 5.6.4
@@ -56,6 +58,15 @@ repos:
hooks:
- id: pydocstyle
args: ["--config=python/.flake8"]
exclude: |
(?x)^(
ci|
cpp|
conda|
docs|
java|
notebooks
)
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v11.1.0
hooks:
19 changes: 17 additions & 2 deletions build.sh
@@ -112,28 +112,39 @@ function buildLibCudfJniInDocker {
local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
local workspaceRepoDir="$workspaceDir/cudf"
local workspaceMavenRepoDir="$workspaceDir/.m2/repository"
local workspaceCcacheDir="$workspaceDir/.ccache"
mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
mkdir -p "$HOME/.ccache" "$HOME/.m2"
nvidia-docker build \
-f java/ci/Dockerfile.centos7 \
--build-arg CUDA_VERSION=${cudaVersion} \
-t $imageName .
nvidia-docker run -it -u $(id -u):$(id -g) --rm \
-e PARALLEL_LEVEL \
-e CCACHE_DISABLE \
-e CCACHE_DIR="$workspaceCcacheDir" \
-v "/etc/group:/etc/group:ro" \
-v "/etc/passwd:/etc/passwd:ro" \
-v "/etc/shadow:/etc/shadow:ro" \
-v "/etc/sudoers.d:/etc/sudoers.d:ro" \
-v "$HOME/.ccache:$workspaceCcacheDir:rw" \
-v "$REPODIR:$workspaceRepoDir:rw" \
-v "$localMavenRepo:$workspaceMavenRepoDir:rw" \
--workdir "$workspaceRepoDir/java/target/libcudf-cmake-build" \
${imageName} \
scl enable devtoolset-9 \
"cmake $workspaceRepoDir/cpp \
-G${CMAKE_GENERATOR} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCUDA_STATIC_RUNTIME=ON \
-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \
-DCMAKE_INSTALL_PREFIX==/usr/local/rapids \
-DUSE_NVTX=ON -DCUDF_USE_ARROW_STATIC=ON \
-DCMAKE_INSTALL_PREFIX=/usr/local/rapids \
-DUSE_NVTX=ON \
-DCUDF_USE_ARROW_STATIC=ON \
-DCUDF_ENABLE_ARROW_S3=OFF \
-DBUILD_TESTS=OFF \
-DPER_THREAD_DEFAULT_STREAM=ON \
@@ -145,6 +156,10 @@ function buildLibCudfJniInDocker {
-Dmaven.repo.local=$workspaceMavenRepoDir \
-DskipTests=${SKIP_TESTS:-false} \
-Dparallel.level=${PARALLEL_LEVEL} \
-Dcmake.ccache.opts='-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache' \
-DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \
-DCUDA_STATIC_RUNTIME=ON \
-DPER_THREAD_DEFAULT_STREAM=ON \
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -79,6 +79,7 @@ outputs:
- test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
- test -f $PREFIX/include/cudf/detail/copy.cuh
- test -f $PREFIX/include/cudf/detail/datetime.hpp
- test -f $PREFIX/include/cudf/detail/fill.hpp
- test -f $PREFIX/include/cudf/detail/gather.hpp
4 changes: 3 additions & 1 deletion cpp/CMakeLists.txt
@@ -189,7 +189,6 @@ add_library(
src/ast/expression_parser.cpp
src/ast/expressions.cpp
src/binaryop/binaryop.cpp
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/Add.cu
src/binaryop/compiled/ATan2.cu
src/binaryop/compiled/BitwiseAnd.cu
@@ -220,6 +219,7 @@ add_library(
src/binaryop/compiled/ShiftRightUnsigned.cu
src/binaryop/compiled/Sub.cu
src/binaryop/compiled/TrueDiv.cu
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/util.cpp
src/labeling/label_bins.cu
src/bitmask/null_mask.cu
@@ -238,6 +238,7 @@ add_library(
src/copying/gather.cu
src/copying/get_element.cu
src/copying/pack.cpp
src/copying/purge_nonempty_nulls.cu
src/copying/reverse.cu
src/copying/sample.cu
src/copying/scatter.cu
@@ -361,6 +362,7 @@ add_library(
src/join/mixed_join_size_kernel_nulls.cu
src/join/mixed_join_size_kernels_semi.cu
src/join/semi_join.cu
src/lists/apply_boolean_mask.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_rows.cu
3 changes: 3 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
@@ -16,6 +16,7 @@

#include <benchmarks/io/cuio_common.hpp>

#include <cstdio>
#include <fstream>
#include <numeric>
#include <string>
@@ -145,6 +146,8 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,
// Executes the command and returns stderr output
std::string exec_cmd(std::string_view cmd)
{
// Prevent the output from the command from mixing with the original process' output
std::fflush(nullptr);
// Switch stderr and stdout to only capture stderr
auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
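A note on the redirection added here: popen only reads the child's stdout, so the `( cmd 3>&2 2>&1 1>&3) 2>/dev/null` wrapper routes the command's stderr into the pipe and discards its stdout (the outer redirect points the subshell's stderr at /dev/null before the inner duplications run). A self-contained sketch of the same stderr-only capture, assuming a POSIX shell and written independently of the cudf helper:

#include <array>
#include <cstdio>
#include <memory>
#include <string>

std::string capture_stderr(std::string const& cmd)
{
  // Inside the subshell: fd 3 saves fd 2 (already /dev/null from the outer
  // redirect), fd 2 is pointed at the pipe popen reads, fd 1 at the saved
  // /dev/null. Net effect: stderr is captured, stdout is discarded.
  std::string const redirected = "( " + cmd + " 3>&2 2>&1 1>&3) 2>/dev/null";
  std::fflush(nullptr);  // flush pending output so it cannot interleave
  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected.c_str(), "r"), pclose);
  std::string output;
  std::array<char, 256> buffer{};
  while (pipe && fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
    output += buffer.data();
  }
  return output;
}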
41 changes: 41 additions & 0 deletions cpp/benchmarks/stream_compaction/distinct.cpp
@@ -19,6 +19,7 @@

#include <cudf/column/column_view.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>
@@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});

template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
cudf::rmm_pool_raii pool_raii;

auto const size = state.get_int64("ColumnSize");
auto const dtype = cudf::type_to_id<Type>();
double const null_frequency = state.get_float64("null_frequency");

data_profile table_data_profile;
if (dtype == cudf::type_id::LIST) {
table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4);
table_data_profile.set_distribution_params(
cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4);
table_data_profile.set_list_depth(1);
} else {
// We're comparing distinct() on a non-nested column to that on a list column with the same
// number of distinct rows. The max list size is 4 and the number of distinct values in the
// list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781
// We want this column to also have 781 distinct values.
table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781);
}
table_data_profile.set_null_frequency(null_frequency);

auto const table = create_random_table(
{dtype}, table_size_bytes{static_cast<size_t>(size)}, table_data_profile, 0);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view);
});
}

NVBENCH_BENCH_TYPES(nvbench_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("distinct_list")
.set_type_axes_names({"Type"})
.add_float64_axis("null_frequency", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
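For the arithmetic in the comment above: with list lengths in [0, 4] and 5 distinct child values, the number of distinct list rows is the geometric sum 1 + 5 + 5^2 + 5^3 + 5^4 = 781. A tiny standalone check (hypothetical, not part of the benchmark):

#include <cstdio>

int main()
{
  // Distinct lists of length 0..4 over 5 distinct child values:
  // sum of 5^len for len = 0..4.
  long count = 0;
  long power = 1;  // 5^0
  for (int len = 0; len <= 4; ++len) {
    count += power;
    power *= 5;
  }
  std::printf("%ld\n", count);  // prints 781
  return 0;
}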
24 changes: 19 additions & 5 deletions cpp/benchmarks/text/subword.cpp
@@ -14,7 +14,8 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

@@ -53,9 +54,9 @@ static std::string create_hash_vocab_file()
return hash_file;
}

static void BM_cuda_tokenizer_cudf(benchmark::State& state)
static void BM_subword_tokenizer(benchmark::State& state)
{
uint32_t nrows = 1000;
auto const nrows = static_cast<cudf::size_type>(state.range(0));
std::vector<const char*> h_strings(nrows, "This is a test ");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = create_hash_vocab_file();
@@ -67,6 +68,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
//
auto vocab = nvtext::load_vocabulary_file(hash_file);
for (auto _ : state) {
cuda_event_timer raii(state, true);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
max_sequence_length,
@@ -76,6 +78,18 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
MAX_ROWS_TENSOR);
}
}
BENCHMARK(BM_cuda_tokenizer_cudf);

BENCHMARK_MAIN();
class Subword : public cudf::benchmark {
};

#define SUBWORD_BM_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \
BENCHMARK_REGISTER_F(Subword, name) \
->RangeMultiplier(2) \
->Range(1 << 10, 1 << 17) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer);

// BENCHMARK_MAIN();
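The new registration above moves the tokenizer benchmark onto a Google Benchmark fixture with manual timing, so the reported milliseconds come from the iteration times the benchmark submits itself rather than the framework's own wall clock. A minimal sketch of that manual-time pattern, with std::chrono standing in for cuda_event_timer's CUDA-event measurement (an assumption for illustration, not the cudf helper itself):

#include <benchmark/benchmark.h>
#include <chrono>

static void BM_manual_time(benchmark::State& state)
{
  for (auto _ : state) {
    auto const start = std::chrono::steady_clock::now();
    // ... work under test goes here ...
    auto const end = std::chrono::steady_clock::now();
    // With UseManualTime(), the framework reports this instead of its own clock.
    state.SetIterationTime(std::chrono::duration<double>(end - start).count());
  }
}
BENCHMARK(BM_manual_time)
  ->RangeMultiplier(2)
  ->Range(1 << 10, 1 << 17)
  ->UseManualTime()
  ->Unit(benchmark::kMillisecond);
BENCHMARK_MAIN();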
2 changes: 1 addition & 1 deletion cpp/cmake/thirdparty/get_cucollections.cmake
@@ -22,7 +22,7 @@ function(find_and_configure_cucollections)
GLOBAL_TARGETS cuco::cuco
BUILD_EXPORT_SET cudf-exports
CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections
GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5
GIT_TAG 8b15f06f38d034e815bc72045ca3403787f75e07
EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS}
OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
)