Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-22.06' into mwilson/cuio-chunked-stats-p2
Browse files Browse the repository at this point in the history
  • Loading branch information
hyperbolic2346 committed May 5, 2022
2 parents 0cba0a6 + 14b5169 commit 3447424
Show file tree
Hide file tree
Showing 251 changed files with 8,467 additions and 4,073 deletions.
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

repos:
- repo: https://github.com/PyCQA/isort
rev: 5.6.4
Expand Down Expand Up @@ -56,6 +58,15 @@ repos:
hooks:
- id: pydocstyle
args: ["--config=python/.flake8"]
exclude: |
(?x)^(
ci|
cpp|
conda|
docs|
java|
notebooks
)
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v11.1.0
hooks:
Expand Down
359 changes: 171 additions & 188 deletions CONTRIBUTING.md

Large diffs are not rendered by default.

19 changes: 17 additions & 2 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,28 +112,39 @@ function buildLibCudfJniInDocker {
local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
local workspaceRepoDir="$workspaceDir/cudf"
local workspaceMavenRepoDir="$workspaceDir/.m2/repository"
local workspaceCcacheDir="$workspaceDir/.ccache"
mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
mkdir -p "$HOME/.ccache" "$HOME/.m2"
nvidia-docker build \
-f java/ci/Dockerfile.centos7 \
--build-arg CUDA_VERSION=${cudaVersion} \
-t $imageName .
nvidia-docker run -it -u $(id -u):$(id -g) --rm \
-e PARALLEL_LEVEL \
-e CCACHE_DISABLE \
-e CCACHE_DIR="$workspaceCcacheDir" \
-v "/etc/group:/etc/group:ro" \
-v "/etc/passwd:/etc/passwd:ro" \
-v "/etc/shadow:/etc/shadow:ro" \
-v "/etc/sudoers.d:/etc/sudoers.d:ro" \
-v "$HOME/.ccache:$workspaceCcacheDir:rw" \
-v "$REPODIR:$workspaceRepoDir:rw" \
-v "$localMavenRepo:$workspaceMavenRepoDir:rw" \
--workdir "$workspaceRepoDir/java/target/libcudf-cmake-build" \
${imageName} \
scl enable devtoolset-9 \
"cmake $workspaceRepoDir/cpp \
-G${CMAKE_GENERATOR} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCUDA_STATIC_RUNTIME=ON \
-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \
-DCMAKE_INSTALL_PREFIX==/usr/local/rapids \
-DUSE_NVTX=ON -DCUDF_USE_ARROW_STATIC=ON \
-DCMAKE_INSTALL_PREFIX=/usr/local/rapids \
-DUSE_NVTX=ON \
-DCUDF_USE_ARROW_STATIC=ON \
-DCUDF_ENABLE_ARROW_S3=OFF \
-DBUILD_TESTS=OFF \
-DPER_THREAD_DEFAULT_STREAM=ON \
Expand All @@ -145,6 +156,10 @@ function buildLibCudfJniInDocker {
-Dmaven.repo.local=$workspaceMavenRepoDir \
-DskipTests=${SKIP_TESTS:-false} \
-Dparallel.level=${PARALLEL_LEVEL} \
-Dcmake.ccache.opts='-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_LINKER_LAUNCHER=ccache' \
-DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \
-DCUDA_STATIC_RUNTIME=ON \
-DPER_THREAD_DEFAULT_STREAM=ON \
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ outputs:
- test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
- test -f $PREFIX/include/cudf/detail/copy.cuh
- test -f $PREFIX/include/cudf/detail/datetime.hpp
- test -f $PREFIX/include/cudf/detail/fill.hpp
- test -f $PREFIX/include/cudf/detail/gather.hpp
Expand Down
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,6 @@ add_library(
src/ast/expression_parser.cpp
src/ast/expressions.cpp
src/binaryop/binaryop.cpp
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/Add.cu
src/binaryop/compiled/ATan2.cu
src/binaryop/compiled/BitwiseAnd.cu
Expand Down Expand Up @@ -220,6 +219,7 @@ add_library(
src/binaryop/compiled/ShiftRightUnsigned.cu
src/binaryop/compiled/Sub.cu
src/binaryop/compiled/TrueDiv.cu
src/binaryop/compiled/binary_ops.cu
src/binaryop/compiled/util.cpp
src/labeling/label_bins.cu
src/bitmask/null_mask.cu
Expand All @@ -238,6 +238,7 @@ add_library(
src/copying/gather.cu
src/copying/get_element.cu
src/copying/pack.cpp
src/copying/purge_nonempty_nulls.cu
src/copying/reverse.cu
src/copying/sample.cu
src/copying/scatter.cu
Expand Down Expand Up @@ -302,6 +303,8 @@ add_library(
src/io/comp/cpu_unbz2.cpp
src/io/comp/debrotli.cu
src/io/comp/gpuinflate.cu
src/io/comp/nvcomp_adapter.cpp
src/io/comp/nvcomp_adapter.cu
src/io/comp/snap.cu
src/io/comp/uncomp.cpp
src/io/comp/unsnap.cu
Expand Down
12 changes: 10 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,18 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp)
# ##################################################################################################
# * groupby benchmark -----------------------------------------------------------------------------
ConfigureBench(
GROUPBY_BENCH groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu
groupby/group_struct.cu groupby/group_no_requests.cu groupby/group_scan.cu
GROUPBY_BENCH
groupby/group_sum.cu
groupby/group_nth.cu
groupby/group_shift.cu
groupby/group_struct.cu
groupby/group_no_requests.cu
groupby/group_scan.cu
groupby/group_rank_benchmark.cu
)

ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu)

# ##################################################################################################
# * hashing benchmark -----------------------------------------------------------------------------
ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp)
Expand Down
109 changes: 109 additions & 0 deletions cpp/benchmarks/groupby/group_rank_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/groupby.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmarks groupby rank-scan aggregations for a given `cudf::rank_method`.
 *
 * Builds a two-column random INT64 table (column 0 = group keys drawn uniformly
 * from [0, num_groups], column 1 = values to rank), optionally pre-sorts it, and
 * times `groupby::scan` with a rank aggregation.
 *
 * nvbench axes:
 *  - "data_size":  number of rows in the input table
 *  - "is_sorted":  0 = unsorted input, 1 = input pre-sorted (groupby told sorted::YES)
 */
template <cudf::rank_method method>
static void nvbench_groupby_rank(nvbench::state& state,
                                 nvbench::type_list<nvbench::enum_type<method>>)
{
  using namespace cudf;
  using type = int64_t;
  // Derive the type id from the `type` alias so the two cannot drift apart.
  constexpr auto dtype = type_to_id<type>();
  cudf::rmm_pool_raii pool_raii;

  bool const is_sorted              = state.get_int64("is_sorted");
  cudf::size_type const column_size = state.get_int64("data_size");
  constexpr int num_groups          = 100;

  data_profile profile;
  profile.set_null_frequency(std::nullopt);  // no nulls in the generated data
  profile.set_cardinality(0);
  profile.set_distribution_params<type>(dtype, distribution_id::UNIFORM, 0, num_groups);

  auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile);

  // values to be pre-sorted too for groupby rank
  if (is_sorted) source_table = cudf::sort(*source_table);

  table_view keys{{source_table->view().column(0)}};
  column_view order_by{source_table->view().column(1)};

  auto agg = cudf::make_rank_aggregation<groupby_scan_aggregation>(method);
  std::vector<groupby::scan_request> requests;
  requests.emplace_back(groupby::scan_request());
  requests[0].values = order_by;
  requests[0].aggregations.push_back(std::move(agg));

  groupby::groupby gb_obj(keys, null_policy::EXCLUDE, is_sorted ? sorted::YES : sorted::NO);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // groupby scan uses sort implementation. `scan` exposes no stream
    // parameter here, so the launch stream is intentionally unused and we rely
    // on exec_tag::sync for correct timing.
    auto result = gb_obj.scan(requests);
  });
}

// Map each benchmarked rank method to short and fully-qualified display strings
// for nvbench's type axis.
NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  cudf::rank_method,
  [](cudf::rank_method value) {
    switch (value) {
      case cudf::rank_method::FIRST: return "FIRST";
      case cudf::rank_method::AVERAGE: return "AVERAGE";
      case cudf::rank_method::MIN: return "MIN";
      case cudf::rank_method::MAX: return "MAX";
      case cudf::rank_method::DENSE: return "DENSE";
      default: return "unknown";
    }
  },
  [](cudf::rank_method value) {
    switch (value) {
      case cudf::rank_method::FIRST: return "cudf::rank_method::FIRST";
      case cudf::rank_method::AVERAGE: return "cudf::rank_method::AVERAGE";
      case cudf::rank_method::MIN: return "cudf::rank_method::MIN";
      case cudf::rank_method::MAX: return "cudf::rank_method::MAX";
      case cudf::rank_method::DENSE: return "cudf::rank_method::DENSE";
      default: return "unknown";
    }
  })

using methods = nvbench::enum_type_list<cudf::rank_method::AVERAGE,
                                        cudf::rank_method::DENSE,
                                        cudf::rank_method::FIRST,
                                        cudf::rank_method::MAX,
                                        cudf::rank_method::MIN>;

NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods))
  .set_type_axes_names({"rank_method"})
  .set_name("groupby_rank")
  .add_int64_axis("data_size",
                  {
                    1000000,    // 1M
                    10000000,   // 10M
                    100000000,  // 100M
                  })

  .add_int64_axis("is_sorted", {0, 1});
3 changes: 3 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <benchmarks/io/cuio_common.hpp>

#include <cstdio>
#include <fstream>
#include <numeric>
#include <string>
Expand Down Expand Up @@ -145,6 +146,8 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,
// Executes the command and returns stderr output
std::string exec_cmd(std::string_view cmd)
{
// Prevent the output from the command from mixing with the original process' output
std::fflush(nullptr);
// Switch stderr and stdout to only capture stderr
auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/reduction/segment_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ std::pair<std::unique_ptr<column>, thrust::device_vector<size_type>> make_test_d

thrust::device_vector<size_type> d_offsets(offset_it, offset_it + num_segments + 1);

return std::make_pair(std::move((input->release())[0]), d_offsets);
return std::pair(std::move((input->release())[0]), d_offsets);
}

template <typename InputType, typename OutputType, aggregation::Kind kind>
Expand Down
41 changes: 41 additions & 0 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <cudf/column/column_view.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>
Expand Down Expand Up @@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});

// Benchmarks cudf::detail::distinct() on a single random column of `Type`,
// comparing a LIST column against a non-nested column with a comparable number
// of distinct rows.
//
// nvbench axes:
//  - "ColumnSize":     target column size in bytes (table_size_bytes)
//  - "null_frequency": fraction of null rows in the generated column
template <typename Type>
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
  // RAII helper that installs an RMM pool memory resource for this run.
  cudf::rmm_pool_raii pool_raii;

  auto const size             = state.get_int64("ColumnSize");
  auto const dtype            = cudf::type_to_id<Type>();
  double const null_frequency = state.get_float64("null_frequency");

  data_profile table_data_profile;
  if (dtype == cudf::type_id::LIST) {
    // Lists: element count and INT32 child values both drawn uniformly from
    // [0, 4], nesting depth fixed at 1.
    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4);
    table_data_profile.set_distribution_params(
      cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4);
    table_data_profile.set_list_depth(1);
  } else {
    // We're comparing distinct() on a non-nested column to that on a list column with the same
    // number of distinct rows. The max list size is 4 and the number of distinct values in the
    // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781
    // We want this column to also have 781 distinct values.
    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781);
  }
  table_data_profile.set_null_frequency(null_frequency);

  // Single-column table sized by bytes (not rows); fixed seed 0 for
  // reproducibility across runs.
  auto const table = create_random_table(
    {dtype}, table_size_bytes{static_cast<size_t>(size)}, table_data_profile, 0);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    rmm::cuda_stream_view stream_view{launch.get_stream()};
    // Key column index {0}; nulls compare equal.
    auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view);
  });
}

NVBENCH_BENCH_TYPES(nvbench_distinct_list,
                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
  .set_name("distinct_list")
  .set_type_axes_names({"Type"})
  .add_float64_axis("null_frequency", {0.0, 0.1})
  .add_int64_axis("ColumnSize", {100'000'000});
24 changes: 19 additions & 5 deletions cpp/benchmarks/text/subword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

Expand Down Expand Up @@ -53,9 +54,9 @@ static std::string create_hash_vocab_file()
return hash_file;
}

static void BM_cuda_tokenizer_cudf(benchmark::State& state)
static void BM_subword_tokenizer(benchmark::State& state)
{
uint32_t nrows = 1000;
auto const nrows = static_cast<cudf::size_type>(state.range(0));
std::vector<const char*> h_strings(nrows, "This is a test ");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
std::string hash_file = create_hash_vocab_file();
Expand All @@ -67,6 +68,7 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
//
auto vocab = nvtext::load_vocabulary_file(hash_file);
for (auto _ : state) {
cuda_event_timer raii(state, true);
auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
max_sequence_length,
Expand All @@ -76,6 +78,18 @@ static void BM_cuda_tokenizer_cudf(benchmark::State& state)
MAX_ROWS_TENSOR);
}
}
BENCHMARK(BM_cuda_tokenizer_cudf);

BENCHMARK_MAIN();
class Subword : public cudf::benchmark {
};

#define SUBWORD_BM_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \
BENCHMARK_REGISTER_F(Subword, name) \
->RangeMultiplier(2) \
->Range(1 << 10, 1 << 17) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer);

// BENCHMARK_MAIN();
4 changes: 2 additions & 2 deletions cpp/docs/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ The preferred style for how inputs are passed in and outputs are returned is the

Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be
done in C++ (including creating a `struct` for the output). One convenient way to do this is
using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke
using `std::tie` and `std::pair`. Note that objects passed to `std::pair` will invoke
either the copy constructor or the move constructor of the object, and it may be preferable to move
non-trivially copyable objects (and required for types with deleted copy constructors, like
`std::unique_ptr`).
Expand All @@ -585,7 +585,7 @@ std::pair<table, table> return_two_tables(void){
// Do stuff with out0, out1

// Return a std::pair of the two outputs
return std::make_pair(std::move(out0), std::move(out1));
return std::pair(std::move(out0), std::move(out1));
}

cudf::table out0;
Expand Down
Loading

0 comments on commit 3447424

Please sign in to comment.