Merge remote-tracking branch 'upstream/branch-24.04' into cuco-set-groupby
rapidsai · Feb 29, 2024 · 6e54cd9 · 6e54cd9
2 parents 8bade44 + efc4edf
commit 6e54cd9
Show file tree

Hide file tree

Showing 16 changed files with 310 additions and 161 deletions.
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
@@ -3,6 +3,8 @@
 
 set -euo pipefail
 
+export RAPIDS_VERSION_NUMBER="$(rapids-generate-version)"
+
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
@@ -27,7 +29,6 @@ rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
   libcudf cudf dask-cudf
 
-export RAPIDS_VERSION_NUMBER="24.04"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build CPP docs"

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
@@ -86,7 +86,6 @@ for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
 done
-sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
 
 # Java files
 NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT"

diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh
@@ -8,9 +8,7 @@ source ./ci/test_cpp_common.sh
 
 rapids-logger "Memcheck gtests with rmm_mode=cuda"
 
-./ci/run_cudf_memcheck_ctests.sh \
-    --gtest_output=xml:"${RAPIDS_TESTS_DIR}${test_name}.xml" \
- && EXITCODE=$? || EXITCODE=$?;
+./ci/run_cudf_memcheck_ctests.sh && EXITCODE=$? || EXITCODE=$?;
 
 rapids-logger "Test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -150,5 +150,16 @@ std::unique_ptr<table> sort(table_view const& values,
                             rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr);
 
+/**
+ * @copydoc cudf::stable_sort
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<table> stable_sort(table_view const& values,
+                                   std::vector<order> const& column_order,
+                                   std::vector<null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr);
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -115,6 +115,18 @@ std::unique_ptr<table> sort(
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Performs a stable lexicographic sort of the rows of a table
+ *
+ * @copydoc cudf::sort
+ */
+std::unique_ptr<table> stable_sort(
+  table_view const& input,
+  std::vector<order> const& column_order         = {},
+  std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a key-value sort.
  *
@@ -148,26 +160,7 @@ std::unique_ptr<table> sort_by_key(
 /**
  * @brief Performs a key-value stable sort.
  *
- * Creates a new table that reorders the rows of `values` according to the
- * lexicographic ordering of the rows of `keys`.
- *
- * The order of equivalent elements is guaranteed to be preserved.
- *
- * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`.
- *
- * @param values The table to reorder
- * @param keys The table that determines the ordering
- * @param column_order The desired order for each column in `keys`. Size must be
- * equal to `keys.num_columns()` or empty. If empty, all columns are sorted in
- * ascending order.
- * @param null_precedence The desired order of a null element compared to other
- * elements for each column in `keys`. Size must be equal to
- * `keys.num_columns()` or empty. If empty, all columns will be sorted with
- * `null_order::BEFORE`.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned table's device memory
- * @return The reordering of `values` determined by the lexicographic order of
- * the rows of `keys`.
+ * @copydoc cudf::sort_by_key
  */
 std::unique_ptr<table> stable_sort_by_key(
   table_view const& values,

diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
@@ -205,27 +205,24 @@ CUDF_KERNEL void distinct_join_probe_kernel(Iter iter,
     cudf::size_type buffer_size = 0;
 
     while (idx - block.thread_rank() < n) {  // the whole thread block falls into the same iteration
-      cudf::size_type thread_count{0};
-      cudf::size_type build_idx{0};
-      if (idx < n) {
-        auto const found = hash_table.find(*(iter + idx));
-        thread_count     = found != hash_table.end();
-        build_idx        = static_cast<cudf::size_type>(found->second);
-      }
+      auto const found     = idx < n ? hash_table.find(*(iter + idx)) : hash_table.end();
+      auto const has_match = found != hash_table.end();
 
       // Use a whole-block scan to calculate the output location
       cudf::size_type offset;
       cudf::size_type block_count;
-      block_scan(block_scan_temp_storage).ExclusiveSum(thread_count, offset, block_count);
+      block_scan(block_scan_temp_storage)
+        .ExclusiveSum(static_cast<cudf::size_type>(has_match), offset, block_count);
 
       if (buffer_size + block_count > buffer_capacity) {
         flush_buffer(block, buffer_size, buffer, counter, build_indices, probe_indices);
         block.sync();
         buffer_size = 0;
       }
 
-      if (thread_count == 1) {
-        buffer[buffer_size + offset] = cuco::pair{build_idx, static_cast<cudf::size_type>(idx)};
+      if (has_match) {
+        buffer[buffer_size + offset] = cuco::pair{static_cast<cudf::size_type>(found->second),
+                                                  static_cast<cudf::size_type>(idx)};
       }
       buffer_size += block_count;
       block.sync();

diff --git a/cpp/src/sort/common_sort_impl.cuh b/cpp/src/sort/common_sort_impl.cuh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/traits.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/sort.h>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The enum specifying which sorting method to use (stable or unstable).
+ */
+enum class sort_method : bool { STABLE, UNSTABLE };
+
+/**
+ * @brief Functor performs a fast-path, in-place sort on eligible columns
+ *
+ * @tparam method Whether to use a stable or unstable sort.
+ */
+template <sort_method method>
+struct inplace_column_sort_fn {
+  /**
+   * @brief Check if fast-path, in-place sort is available for the given column
+   *
+   * @param column to check
+   * @return true if fast-path sort is available, false otherwise.
+   */
+  static bool is_usable(column_view const& column)
+  {
+    return !column.has_nulls() && cudf::is_fixed_width(column.type()) &&
+           !cudf::is_floating_point(column.type());
+  }
+  /**
+   * @brief Check if fast-path, in-place sort is available for the given table
+   *
+   * @param table to check
+   * @return true if fast-path sort is available, false otherwise.
+   */
+  static bool is_usable(table_view const& table)
+  {
+    return table.num_columns() == 1 && is_usable(table.column(0));
+  }
+
+  /**
+   * @brief Fast-path sort a column in place
+   *
+   * Precondition, is_usable(column) returned true
+   *
+   * @tparam T column data type.
+   * @param col Column to sort, modified in place.
+   * @param order Ascending or descending sort order.
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   *
+   */
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(mutable_column_view& col, order order, rmm::cuda_stream_view stream) const
+  {
+    auto const do_sort = [&](auto const cmp) {
+      if constexpr (method == sort_method::STABLE) {
+        thrust::stable_sort(rmm::exec_policy(stream), col.begin<T>(), col.end<T>(), cmp);
+      } else {
+        thrust::sort(rmm::exec_policy(stream), col.begin<T>(), col.end<T>(), cmp);
+      }
+    };
+    if (order == order::ASCENDING) {
+      do_sort(thrust::less<T>());
+    } else {
+      do_sort(thrust::greater<T>());
+    }
+  }
+
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(mutable_column_view&, order, rmm::cuda_stream_view) const
+  {
+    CUDF_FAIL("Column type must be relationally comparable and fixed-width");
+  }
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+#pragma once
+
+#include "common_sort_impl.cuh"
+
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.hpp>
@@ -29,11 +33,6 @@
 namespace cudf {
 namespace detail {
 
-/**
- * @brief The enum specifying which sorting method to use (stable or unstable).
- */
-enum class sort_method { STABLE, UNSTABLE };
-
 /**
  * @brief Functor performs faster segmented sort on eligible columns
  */

diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "common_sort_impl.cuh"
 #include "sort_impl.cuh"
 
 #include <cudf/column/column.hpp>
@@ -37,7 +38,7 @@ std::unique_ptr<column> sorted_order(table_view const& input,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
-  return sorted_order<false>(input, column_order, null_precedence, stream, mr);
+  return sorted_order<sort_method::UNSTABLE>(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort_by_key(table_view const& values,
@@ -61,47 +62,24 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                         mr);
 }
 
-struct inplace_column_sort_fn {
-  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
-  void operator()(mutable_column_view& col, bool ascending, rmm::cuda_stream_view stream) const
-  {
-    CUDF_EXPECTS(!col.has_nulls(), "Nulls not supported for in-place sort");
-    if (ascending) {
-      thrust::sort(rmm::exec_policy(stream), col.begin<T>(), col.end<T>(), thrust::less<T>());
-    } else {
-      thrust::sort(rmm::exec_policy(stream), col.begin<T>(), col.end<T>(), thrust::greater<T>());
-    }
-  }
-
-  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
-  void operator()(mutable_column_view&, bool, rmm::cuda_stream_view) const
-  {
-    CUDF_FAIL("Column type must be relationally comparable and fixed-width");
-  }
-};
-
 std::unique_ptr<table> sort(table_view const& input,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
                             rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr)
 {
-  CUDF_FUNC_RANGE();
   // fast-path sort conditions: single, non-floating-point, fixed-width column with no nulls
-  if (input.num_columns() == 1 && !input.column(0).has_nulls() &&
-      cudf::is_fixed_width(input.column(0).type()) &&
-      !cudf::is_floating_point(input.column(0).type())) {
-    auto output    = std::make_unique<column>(input.column(0), stream, mr);
-    auto view      = output->mutable_view();
-    bool ascending = (column_order.empty() ? true : column_order.front() == order::ASCENDING);
+  if (inplace_column_sort_fn<sort_method::UNSTABLE>::is_usable(input)) {
+    auto output = std::make_unique<column>(input.column(0), stream, mr);
+    auto view   = output->mutable_view();
+    auto order  = (column_order.empty() ? order::ASCENDING : column_order.front());
     cudf::type_dispatcher<dispatch_storage_type>(
-      output->type(), inplace_column_sort_fn{}, view, ascending, stream);
+      output->type(), inplace_column_sort_fn<sort_method::UNSTABLE>{}, view, order, stream);
     std::vector<std::unique_ptr<column>> columns;
     columns.emplace_back(std::move(output));
     return std::make_unique<table>(std::move(columns));
   }
-  return detail::sort_by_key(
-    input, input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort_by_key(input, input, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace detail

diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "common_sort_impl.cuh"
 #include "sort_column_impl.cuh"
 
 #include <cudf/column/column_factories.hpp>
@@ -30,19 +31,19 @@ namespace detail {
  * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
  */
 template <>
-std::unique_ptr<column> sorted_order<false>(column_view const& input,
-                                            order column_order,
-                                            null_order null_precedence,
-                                            rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> sorted_order<sort_method::UNSTABLE>(column_view const& input,
+                                                            order column_order,
+                                                            null_order null_precedence,
+                                                            rmm::cuda_stream_view stream,
+                                                            rmm::mr::device_memory_resource* mr)
 {
   auto sorted_indices = cudf::make_numeric_column(
     data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
   mutable_column_view indices_view = sorted_indices->mutable_view();
   thrust::sequence(
     rmm::exec_policy(stream), indices_view.begin<size_type>(), indices_view.end<size_type>(), 0);
   cudf::type_dispatcher<dispatch_storage_type>(input.type(),
-                                               column_sorted_order_fn<false>{},
+                                               column_sorted_order_fn<sort_method::UNSTABLE>{},
                                                input,
                                                indices_view,
                                                column_order == order::ASCENDING,