Skip to content

Commit

Permalink
Merge branch 'branch-0.18' into rwlee/sparkspecific
Browse files Browse the repository at this point in the history
  • Loading branch information
rwlee authored Jan 4, 2021
2 parents 4b6db38 + ca1a4d6 commit a4e95fe
Show file tree
Hide file tree
Showing 257 changed files with 6,185 additions and 4,052 deletions.
15 changes: 13 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,38 @@

## New Features
- PR #6856 Add groupby idxmin, idxmax aggregation

- PR #6847 Add a cmake find module for cuFile in JNI code
- PR #6902 Implement `DataFrame.quantile` for `datetime` and `timedelta` data types
- PR #6814 Implement `cudf::reduce` for `decimal32` and `decimal64` (part 1)
- PR #6929 Add `Index.set_names` api
- PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
- PR #6885 Share `factorize` implementation with Index and cudf module

- PR #6775 Implement cudf.DateOffset for months

## Improvements

- PR #6938 Pass numeric scalars of the same dtype through numeric binops
- PR #6275 Update to official libcu++ on Github
- PR #6838 Fix `columns` & `index` handling in dataframe constructor
- PR #6750 Remove **kwargs from string/categorical methods
- PR #6909 Support reading byte array backed decimal columns from parquet files
- PR #6939 Use simplified `rmm::exec_policy`
- PR #6512 Refactor rolling.cu to reduce compile time
- PR #6982 Disable some pragma unroll statements in thrust `sort.h`

## Bug Fixes

- PR #6884 Correct the sampling range when sampling with replacement
- PR #6903 Add null count test for apply_boolean_mask
- PR #6922 Fix N/A detection for empty fields in CSV reader
- PR #6912 Fix rmm_mode=managed parameter for gtests
- PR #6943 Fix join with nulls not equal performance
- PR #6945 Fix groupby agg/apply behaviour when no key columns are provided
- PR #6942 Fix cudf::merge gtest for dictionary columns


# cuDF 0.17.0 (Date TBD)
# cuDF 0.17.0 (10 Dec 2020)

## New Features

Expand Down Expand Up @@ -63,6 +72,7 @@
- PR #6765 Cupy fallback for __array_function__ and __array_ufunc__ for cudf.Series
- PR #6817 Add support for scatter() on lists-of-struct columns
- PR #6805 Implement `cudf::detail::copy_if` for `decimal32` and `decimal64`
- PR #6483 Add `agg` function to aggregate dataframe using one or more operations
- PR #6726 Support selecting different hash functions in hash_partition
- PR #6619 Improve Dockerfile
- PR #6831 Added parquet chunked writing ability for list columns
Expand Down Expand Up @@ -153,6 +163,7 @@
- PR #6837 Avoid gather when copying strings view from start of strings column
- PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp
- PR #6807 Refactor `std::array` usage in row group index writing in ORC
- PR #6914 Enable groupby `list` aggregation for strings
- PR #6908 Parquet option for strictly decimal reading

## Bug Fixes
Expand Down
6 changes: 3 additions & 3 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -205,15 +205,15 @@ fi

cd $WORKSPACE/python/cudf
gpuci_logger "Python py.test for cuDF"
py.test --cache-clear --basetemp=${WORKSPACE}/cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-cudf.xml -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:${WORKSPACE}/python/cudf/cudf-coverage.xml --cov-report term
py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-cudf.xml -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:${WORKSPACE}/python/cudf/cudf-coverage.xml --cov-report term

cd $WORKSPACE/python/dask_cudf
gpuci_logger "Python py.test for dask-cudf"
py.test --cache-clear --basetemp=${WORKSPACE}/dask-cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/python/dask_cudf/dask-cudf-coverage.xml --cov-report term
py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/dask-cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/python/dask_cudf/dask-cudf-coverage.xml --cov-report term

cd $WORKSPACE/python/custreamz
gpuci_logger "Python py.test for cuStreamz"
py.test --cache-clear --basetemp=${WORKSPACE}/custreamz-cuda-tmp --junitxml=${WORKSPACE}/junit-custreamz.xml -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:${WORKSPACE}/python/custreamz/custreamz-coverage.xml --cov-report term
py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/custreamz-cuda-tmp --junitxml=${WORKSPACE}/junit-custreamz.xml -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:${WORKSPACE}/python/custreamz/custreamz-coverage.xml --cov-report term

gpuci_logger "Test notebooks"
${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- fsspec>=0.6.0
- pytest
- pytest-benchmark
- pytest-xdist
- sphinx
- sphinx_rtd_theme
- sphinxcontrib-websupport
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda10.2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- fsspec>=0.6.0
- pytest
- pytest-benchmark
- pytest-xdist
- sphinx
- sphinx_rtd_theme
- sphinxcontrib-websupport
Expand Down
1 change: 1 addition & 0 deletions conda/environments/cudf_dev_cuda11.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- fsspec>=0.6.0
- pytest
- pytest-benchmark
- pytest-xdist
- sphinx
- sphinx_rtd_theme
- sphinxcontrib-websupport
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/libcudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ requirements:
- cmake >=3.17.0
host:
- libcudf {{ version }}
- librdkafka 1.5
- librdkafka >=1.5.0,<1.5.3
run:
- {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not

Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ ConfigureBench(SEARCH_BENCH "${SEARCH_BENCH_SRC}")
# - sort benchmark --------------------------------------------------------------------------------

set(SORT_BENCH_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu")
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu"
"${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cu")

ConfigureBench(SORT_BENCH "${SORT_BENCH_SRC}")

Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/common/generate_benchmark_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
*/
struct string_column_data {
std::vector<char> chars;
std::vector<int32_t> offsets;
std::vector<cudf::size_type> offsets;
std::vector<cudf::bitmask_type> null_mask;
explicit string_column_data(cudf::size_type rows, cudf::size_type size)
{
Expand Down
73 changes: 60 additions & 13 deletions cpp/benchmarks/join/join_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <fixture/benchmark_fixture.hpp>
Expand All @@ -36,7 +37,7 @@ template <typename key_type, typename payload_type>
class Join : public cudf::benchmark {
};

template <typename key_type, typename payload_type>
template <typename key_type, typename payload_type, bool Nullable>
static void BM_join(benchmark::State &state)
{
const cudf::size_type build_table_size{(cudf::size_type)state.range(0)};
Expand All @@ -46,11 +47,33 @@ static void BM_join(benchmark::State &state)
const bool is_build_table_key_unique = true;

// Generate build and probe tables

auto build_key_column =
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()), build_table_size);
auto probe_key_column =
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()), probe_table_size);
cudf::test::UniformRandomGenerator<cudf::size_type> rand_gen(0, build_table_size);
auto build_random_null_mask = [&rand_gen](int size) {
if (Nullable) {
// roughly 25% nulls
auto validity = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; });
return cudf::test::detail::make_null_mask(validity, validity + size);
} else {
return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED);
}
};

std::unique_ptr<cudf::column> build_key_column = [&]() {
return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
build_table_size,
build_random_null_mask(build_table_size))
: cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
build_table_size);
}();
std::unique_ptr<cudf::column> probe_key_column = [&]() {
return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
probe_table_size,
build_random_null_mask(probe_table_size))
: cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
probe_table_size);
}();

generate_input_tables<key_type, cudf::size_type>(
build_key_column->mutable_view().data<key_type>(),
Expand Down Expand Up @@ -82,17 +105,23 @@ static void BM_join(benchmark::State &state)
for (auto _ : state) {
cuda_event_timer raii(state, true, 0);

auto result =
cudf::inner_join(probe_table, build_table, columns_to_join, columns_to_join, {{0, 0}});
auto result = cudf::inner_join(probe_table,
build_table,
columns_to_join,
columns_to_join,
{{0, 0}},
cudf::null_equality::UNEQUAL);
}
}

#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type) \
BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \
(::benchmark::State & st) { BM_join<key_type, payload_type>(st); }
#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \
(::benchmark::State & st) { BM_join<key_type, payload_type, nullable>(st); }

JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t);
JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t);
JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false);
JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false);
JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true);
JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true);

BENCHMARK_REGISTER_F(Join, join_32bit)
->Unit(benchmark::kMillisecond)
Expand All @@ -111,3 +140,21 @@ BENCHMARK_REGISTER_F(Join, join_64bit)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();

BENCHMARK_REGISTER_F(Join, join_32bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({100'000, 1'000'000})
->Args({10'000'000, 10'000'000})
->Args({10'000'000, 40'000'000})
->Args({10'000'000, 100'000'000})
->Args({100'000'000, 100'000'000})
->Args({80'000'000, 240'000'000})
->UseManualTime();

BENCHMARK_REGISTER_F(Join, join_64bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();
49 changes: 49 additions & 0 deletions cpp/benchmarks/sort/sort_strings_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/sorting.hpp>
#include <cudf/types.hpp>

// Benchmark fixture for string-column sort benchmarks; inherits the common
// cudf benchmark setup (memory resource / device configuration) and adds no state.
class SortStrings : public cudf::benchmark {
};

/**
 * @brief Benchmarks `cudf::sort` on a single random STRING column.
 *
 * The number of rows is taken from the benchmark's first range argument.
 * The input table is generated once, outside the timed loop; each iteration
 * times only the sort itself via `cuda_event_timer` (manual GPU timing).
 *
 * @param state google-benchmark state carrying the row-count argument and timing.
 */
static void BM_sort(benchmark::State& state)
{
  // static_cast instead of a C-style cast: intent-revealing and greppable.
  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};

  // One random STRING column of n_rows rows (presumably uniformly random
  // contents from the shared benchmark-input generator — see generate_benchmark_input.hpp).
  auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows});

  for (auto _ : state) {
    cuda_event_timer raii(state, true, 0);  // RAII timer: starts/stops GPU event timing per iteration
    cudf::sort(table->view());              // result intentionally discarded; only the sort is measured
  }
}

// Defines and registers a SortStrings benchmark named `name` that forwards to
// BM_sort. Row counts sweep 2^10..2^24 in multiples of 8; UseManualTime()
// reports the GPU time recorded by cuda_event_timer rather than wall time.
// (Comments must stay outside the macro: a // inside would break the \ continuations.)
#define SORT_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(SortStrings, name) \
(::benchmark::State & st) { BM_sort(st); } \
BENCHMARK_REGISTER_F(SortStrings, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 10, 1 << 24}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Instantiate the single strings-sort benchmark.
SORT_BENCHMARK_DEFINE(stringssort)
6 changes: 3 additions & 3 deletions cpp/benchmarks/synchronization/synchronization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
/**
* @file synchronization.hpp
* @brief This is the header file for `cuda_event_timer`.
**/
*/

/**
* @brief This class serves as a wrapper for using `cudaEvent_t` as the user
Expand Down Expand Up @@ -54,7 +54,7 @@
BENCHMARK(sample_cuda_benchmark)->UseManualTime();
**/
*/

#ifndef CUDF_BENCH_SYNCHRONIZATION_H
#define CUDF_BENCH_SYNCHRONIZATION_H
Expand All @@ -79,7 +79,7 @@ class cuda_event_timer {
* @param[in] flush_l2_cache_ whether or not to flush the L2 cache before
* every iteration.
* @param[in] stream_ The CUDA stream we are measuring time on.
**/
*/
cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
Expand Down
2 changes: 1 addition & 1 deletion cpp/docs/TRANSITIONGUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ namespace experimental{
* @param mr Memory resource used to allocate device memory for the returned
* output column
* @return std::unique_ptr<column> Newly allocated output column
**/
*/
std::unique_ptr<column> new_function(cudf::column_view input,
cudf::mutable_column_view in_out,
cudf::table_view input_table,
Expand Down
4 changes: 0 additions & 4 deletions cpp/include/cudf/ast/detail/linearizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ namespace detail {
*
* This enum is device-specific. For instance, intermediate data references are generated by the
* linearization process but cannot be explicitly created by the user.
*
*/
enum class device_data_reference_type {
COLUMN, // A value in a table column
Expand All @@ -52,7 +51,6 @@ enum class device_data_reference_type {
*
* This is a POD class used to create references describing data type and locations for consumption
* by the `row_evaluator`.
*
*/
struct alignas(8) device_data_reference {
device_data_reference(device_data_reference_type reference_type,
Expand Down Expand Up @@ -85,7 +83,6 @@ class linearizer;
*
* This class is a part of a "visitor" pattern with the `linearizer` class.
* Nodes inheriting from this class can accept visitors.
*
*/
class node {
friend class detail::linearizer;
Expand All @@ -104,7 +101,6 @@ class node {
* the nodes and constructing vectors of information that are later used by the device for
* evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are
* resolved into intermediate data storage in shared memory.
*
*/
class linearizer {
friend class literal;
Expand Down
3 changes: 0 additions & 3 deletions cpp/include/cudf/ast/detail/operators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,6 @@ struct dispatch_unary_operator_types {

/**
* @brief Functor performing a type dispatch for a unary operator.
*
*/
struct type_dispatch_unary_op {
template <ast_operator op, typename F, typename... Ts>
Expand Down Expand Up @@ -968,7 +967,6 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void unary_operator_dispatcher(ast_operator

/**
* @brief Functor to determine the return type of an operator from its input types.
*
*/
struct return_type_functor {
/**
Expand Down Expand Up @@ -1057,7 +1055,6 @@ inline cudf::data_type ast_operator_return_type(ast_operator op,

/**
* @brief Functor to determine the arity (number of operands) of an operator.
*
*/
struct arity_functor {
template <ast_operator op>
Expand Down
1 change: 0 additions & 1 deletion cpp/include/cudf/ast/detail/transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ struct binary_row_output : public row_output {
* This class is designed for n-ary transform evaluation. Currently this class assumes that there's
* only one relevant "row index" in its methods, which corresponds to a row in a single input table
* and the same row index in an output column.
*
*/
struct row_evaluator {
friend struct row_output;
Expand Down
Loading

0 comments on commit a4e95fe

Please sign in to comment.