Commit

Merge branch 'branch-22.04' into scan_reduce_aggregations
nvdbaranec committed Mar 11, 2022
2 parents 23cae44 + c0f7fe6 commit 6f940fd
Showing 241 changed files with 9,314 additions and 5,016 deletions.
4 changes: 4 additions & 0 deletions build.sh
@@ -168,6 +168,10 @@ if hasArg clean; then
rmdir ${bd} || true
fi
done

# Cleaning up python artifacts
find ${REPODIR}/python/ | grep -E "(__pycache__|\.pyc|\.pyo|\.so$)" | xargs rm -rf

fi


5 changes: 3 additions & 2 deletions ci/gpu/build.sh
@@ -239,9 +239,10 @@ fi
# TEST - Run py.test, notebooks
################################################################################

cd "$WORKSPACE/python/cudf"
cd "$WORKSPACE/python/cudf/cudf"
# It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
gpuci_logger "Python py.test for cuDF"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests

cd "$WORKSPACE/python/dask_cudf"
gpuci_logger "Python py.test for dask-cudf"
5 changes: 4 additions & 1 deletion codecov.yml
@@ -2,4 +2,7 @@
coverage:
status:
project: off
patch: off
patch: on

github_checks:
annotations: true
4 changes: 4 additions & 0 deletions conda/recipes/cudf/meta.yaml
@@ -22,6 +22,9 @@ build:
- CC
- CXX
- CUDAHOSTCXX
# libcudf's run_exports pinning is looser than we would like
ignore_run_exports:
- libcudf

requirements:
build:
@@ -44,6 +47,7 @@ requirements:
- numba >=0.54
- numpy
- {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda
- libcudf {{ version }}
- fastavro >=0.22.0
- {{ pin_compatible('rmm', max_pin='x.x') }}
- fsspec>=0.6.0
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
@@ -137,6 +137,7 @@ test:
- test -f $PREFIX/include/cudf/io/orc_metadata.hpp
- test -f $PREFIX/include/cudf/io/orc.hpp
- test -f $PREFIX/include/cudf/io/parquet.hpp
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
@@ -203,6 +204,7 @@ test:
- test -f $PREFIX/include/cudf/strings/detail/fill.hpp
- test -f $PREFIX/include/cudf/strings/detail/json.hpp
- test -f $PREFIX/include/cudf/strings/detail/replace.hpp
- test -f $PREFIX/include/cudf/strings/detail/utf8.hpp
- test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
- test -f $PREFIX/include/cudf/strings/extract.hpp
- test -f $PREFIX/include/cudf/strings/findall.hpp
5 changes: 5 additions & 0 deletions conda/recipes/libcudf_kafka/meta.yaml
@@ -20,13 +20,18 @@ build:
- PARALLEL_LEVEL
- VERSION_SUFFIX
- PROJECT_FLASH
# libcudf's run_exports pinning is looser than we would like
ignore_run_exports:
- libcudf

requirements:
build:
- cmake >=3.20.1
host:
- libcudf {{version}}
- librdkafka >=1.7.0,<1.8.0a0
run:
- libcudf {{version}}

test:
commands:
9 changes: 9 additions & 0 deletions cpp/CMakeLists.txt
@@ -311,6 +311,7 @@ add_library(
src/io/parquet/writer_impl.cu
src/io/statistics/orc_column_statistics.cu
src/io/statistics/parquet_column_statistics.cu
src/io/text/byte_range_info.cpp
src/io/text/multibyte_split.cu
src/io/utilities/column_buffer.cpp
src/io/utilities/config_utils.cpp
@@ -361,6 +362,7 @@ add_library(
src/quantiles/quantiles.cu
src/reductions/all.cu
src/reductions/any.cu
src/reductions/collect_ops.cu
src/reductions/max.cu
src/reductions/mean.cu
src/reductions/min.cu
@@ -372,6 +374,13 @@ add_library(
src/reductions/scan/scan.cpp
src/reductions/scan/scan_exclusive.cu
src/reductions/scan/scan_inclusive.cu
src/reductions/segmented_all.cu
src/reductions/segmented_any.cu
src/reductions/segmented_max.cu
src/reductions/segmented_min.cu
src/reductions/segmented_product.cu
src/reductions/segmented_reductions.cpp
src/reductions/segmented_sum.cu
src/reductions/std.cu
src/reductions/sum.cu
src/reductions/sum_of_squares.cu
11 changes: 6 additions & 5 deletions cpp/benchmarks/CMakeLists.txt
@@ -14,7 +14,7 @@

find_package(Threads REQUIRED)

add_library(cudf_datagen STATIC common/generate_input.cpp)
add_library(cudf_datagen STATIC common/generate_input.cpp common/generate_nullmask.cu)
target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(
@@ -24,7 +24,7 @@ target_compile_options(

target_link_libraries(
cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main
benchmark::benchmark nvbench::nvbench Threads::Threads cudf
benchmark::benchmark nvbench::nvbench Threads::Threads cudf cudftestutil
)

target_include_directories(
@@ -175,9 +175,10 @@ ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu)
# ##################################################################################################
# * reduction benchmark ---------------------------------------------------------------------------
ConfigureBench(
REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/reduce.cpp
reduction/scan.cpp reduction/minmax.cpp
REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/minmax.cpp
reduction/reduce.cpp reduction/scan.cpp
)
ConfigureNVBench(REDUCTION_NVBENCH reduction/segment_reduce.cu)

# ##################################################################################################
# * reduction benchmark ---------------------------------------------------------------------------
@@ -276,7 +277,7 @@ ConfigureBench(

# ##################################################################################################
# * json benchmark -------------------------------------------------------------------
ConfigureBench(JSON_BENCH string/json.cpp)
ConfigureBench(JSON_BENCH string/json.cu)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
73 changes: 24 additions & 49 deletions cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,72 +14,42 @@
* limitations under the License.
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/transform.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <benchmark/benchmark.h>
#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <list>
#include <numeric>
#include <random>
#include <memory>
#include <vector>

enum class TreeType {
IMBALANCED_LEFT // All operator expressions have a left child operator expression and a right
// child column reference
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
static void BM_ast_transform(benchmark::State& state)
{
const cudf::size_type table_size{(cudf::size_type)state.range(0)};
const cudf::size_type tree_levels = (cudf::size_type)state.range(1);
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};

// Create table data
auto n_cols = reuse_columns ? 1 : tree_levels + 1;
auto column_wrappers = std::vector<cudf::test::fixed_width_column_wrapper<key_type>>(n_cols);
auto columns = std::vector<cudf::column_view>(n_cols);

auto data_iterator = thrust::make_counting_iterator(0);

if constexpr (Nullable) {
auto validities = std::vector<bool>(table_size);
std::random_device rd;
std::mt19937 gen(rd());

std::generate(
validities.begin(), validities.end(), [&]() { return gen() > (0.5 * gen.max()); });
std::generate_n(column_wrappers.begin(), n_cols, [=]() {
return cudf::test::fixed_width_column_wrapper<key_type>(
data_iterator, data_iterator + table_size, validities.begin());
});
} else {
std::generate_n(column_wrappers.begin(), n_cols, [=]() {
return cudf::test::fixed_width_column_wrapper<key_type>(data_iterator,
data_iterator + table_size);
});
}
std::transform(
column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) {
return static_cast<cudf::column_view>(col);
});

cudf::table_view table{columns};
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
auto const source_table =
create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
row_count{table_size},
Nullable ? 0.5 : -1.0);
auto table = source_table->view();

// Create column references
auto column_refs = std::vector<cudf::ast::column_reference>();
@@ -138,10 +108,15 @@ static void CustomRanges(benchmark::internal::Benchmark* b)
}
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) \
{ \
BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
} \
BENCHMARK_REGISTER_F(AST, name) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

AST_TRANSFORM_BENCHMARK_DEFINE(
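For reference, a minimal sketch of the data-generation pattern this diff adopts: the hand-built `cudf::test::fixed_width_column_wrapper` columns are replaced by the shared benchmark generator. This is not part of the commit; it assumes the cudf benchmark tree (`benchmarks/common/generate_input.hpp`) with `create_sequence_table`, `cycle_dtypes`, and `row_count` as called above, while `make_benchmark_table`, the `int32_t` dtype, and the 0.5 null probability are purely illustrative.

```cpp
#include <benchmarks/common/generate_input.hpp>

#include <cudf/types.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

// Hypothetical helper (not in the commit): build a benchmark input table with the
// shared generator instead of per-benchmark cudf::test::fixed_width_column_wrapper.
auto make_benchmark_table(cudf::size_type n_cols, cudf::size_type n_rows, bool nullable)
{
  // cycle_dtypes repeats the requested dtype across n_cols columns; a null
  // probability of 0.5 attaches a ~50% validity mask, while -1.0 disables nulls,
  // mirroring the `Nullable ? 0.5 : -1.0` call in BM_ast_transform above.
  return create_sequence_table(cycle_dtypes({cudf::type_to_id<int32_t>()}, n_cols),
                               row_count{n_rows},
                               nullable ? 0.5 : -1.0);
}
```

The same generator drives the binaryop benchmarks below.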
53 changes: 17 additions & 36 deletions cpp/benchmarks/binaryop/binaryop.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,23 +14,15 @@
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <benchmark/benchmark.h>
#include <fixture/benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <numeric>
#include <vector>

// This set of benchmarks is designed to be a comparison for the AST benchmarks
@@ -47,40 +39,29 @@ class BINARYOP : public cudf::benchmark {
template <typename key_type, TreeType tree_type, bool reuse_columns>
static void BM_binaryop_transform(benchmark::State& state)
{
const cudf::size_type table_size{(cudf::size_type)state.range(0)};
const cudf::size_type tree_levels = (cudf::size_type)state.range(1);
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};

// Create table data
auto n_cols = reuse_columns ? 1 : tree_levels + 1;
auto column_wrappers = std::vector<cudf::test::fixed_width_column_wrapper<key_type>>();
auto columns = std::vector<cudf::column_view>(n_cols);

auto data_iterator = thrust::make_counting_iterator(0);
std::generate_n(std::back_inserter(column_wrappers), n_cols, [=]() {
return cudf::test::fixed_width_column_wrapper<key_type>(data_iterator,
data_iterator + table_size);
});
std::transform(
column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) {
return static_cast<cudf::column_view>(col);
});

cudf::table_view table{columns};
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
auto const source_table = create_sequence_table(
cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
cudf::table_view table{*source_table};

// Execute benchmark
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
// Execute tree that chains additions like (((a + b) + c) + d)
auto const op = cudf::binary_operator::ADD;
auto result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
auto const op = cudf::binary_operator::ADD;
auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
if (reuse_columns) {
auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type);
auto result = cudf::binary_operation(table.column(0), table.column(0), op, result_data_type);
for (cudf::size_type i = 0; i < tree_levels - 1; i++) {
result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type);
result = cudf::binary_operation(result->view(), table.column(0), op, result_data_type);
}
} else {
auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type);
std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) {
auto result = cudf::binary_operation(table.column(0), table.column(1), op, result_data_type);
std::for_each(std::next(table.begin(), 2), table.end(), [&](auto const& col) {
result = cudf::binary_operation(result->view(), col, op, result_data_type);
});
}
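Similarly, the rewritten loop above folds the operation directly over the `cudf::table_view` rather than a separate `std::vector<cudf::column_view>`. A minimal sketch of that fold in isolation, assuming only libcudf's public `cudf::binary_operation` API; `chained_add` is a hypothetical name, and the input is assumed to have at least two columns of a common fixed-width type.

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>

#include <algorithm>
#include <iterator>
#include <memory>

// Hypothetical helper (not in the commit): chain ADD across all columns of a
// table_view, i.e. (((col0 + col1) + col2) + ...), mirroring the
// reuse_columns == false branch of BM_binaryop_transform above.
std::unique_ptr<cudf::column> chained_add(cudf::table_view const& table)
{
  auto const op    = cudf::binary_operator::ADD;
  auto const dtype = table.column(0).type();

  // Start from col0 + col1, then fold each remaining column into the running result.
  auto result = cudf::binary_operation(table.column(0), table.column(1), op, dtype);
  std::for_each(std::next(table.begin(), 2), table.end(), [&](auto const& col) {
    result = cudf::binary_operation(result->view(), col, op, dtype);
  });
  return result;
}
```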
22 changes: 9 additions & 13 deletions cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -14,30 +14,26 @@
* limitations under the License.
*/

#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/binaryop.hpp>

#include <thrust/iterator/counting_iterator.h>

class COMPILED_BINARYOP : public cudf::benchmark {
};

template <typename TypeLhs, typename TypeRhs, typename TypeOut>
void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
{
const cudf::size_type column_size{(cudf::size_type)state.range(0)};
auto const column_size{static_cast<cudf::size_type>(state.range(0))};

auto const source_table = create_random_table(
{cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{column_size});

auto data_it = thrust::make_counting_iterator(0);
cudf::test::fixed_width_column_wrapper<TypeLhs> input1(data_it, data_it + column_size);
cudf::test::fixed_width_column_wrapper<TypeRhs> input2(data_it, data_it + column_size);
auto lhs = cudf::column_view(source_table->get_column(0));
auto rhs = cudf::column_view(source_table->get_column(1));

auto lhs = cudf::column_view(input1);
auto rhs = cudf::column_view(input2);
auto output_dtype = cudf::data_type(cudf::type_to_id<TypeOut>());

// Call once for hot cache.
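The compiled-binaryop benchmark makes the same switch, using `create_random_table` for its two inputs. A minimal sketch of that call pattern, again assuming the cudf benchmark tree's `generate_input.hpp`; `run_one_add` and the chosen dtypes are illustrative, not part of the commit.

```cpp
#include <benchmarks/common/generate_input.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <memory>

// Hypothetical helper (not in the commit): generate two random input columns with
// the shared generator and run a single ADD over them, as the rewritten
// BM_compiled_binaryop above does.
std::unique_ptr<cudf::column> run_one_add(cudf::size_type n_rows)
{
  auto const source_table = create_random_table(
    {cudf::type_to_id<int32_t>(), cudf::type_to_id<double>()}, row_count{n_rows});

  // The table owns the device memory; the views below only alias its columns.
  auto const lhs = cudf::column_view(source_table->get_column(0));
  auto const rhs = cudf::column_view(source_table->get_column(1));

  auto const output_dtype = cudf::data_type(cudf::type_to_id<double>());
  return cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, output_dtype);
}
```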