Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…

…ent/ListOfColumnsRefactor/copying
rapidsai · Mar 8, 2022 · 071b393 · 071b393
2 parents a287693 + e9876cf
commit 071b393
Show file tree

Hide file tree

Showing 38 changed files with 1,844 additions and 1,095 deletions.
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -239,9 +239,10 @@ fi
 # TEST - Run py.test, notebooks
 ################################################################################
 
-cd "$WORKSPACE/python/cudf"
+cd "$WORKSPACE/python/cudf/cudf"
+# It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
 gpuci_logger "Python py.test for cuDF"
-py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests
 
 cd "$WORKSPACE/python/dask_cudf"
 gpuci_logger "Python py.test for dask-cudf"

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -22,6 +22,9 @@ build:
     - CC
     - CXX
     - CUDAHOSTCXX
+  # libcudf's run_exports pinning is looser than we would like
+  ignore_run_exports:
+    - libcudf
 
 requirements:
   build:
@@ -44,6 +47,7 @@ requirements:
     - numba >=0.54
     - numpy
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda
+    - libcudf {{ version }}
     - fastavro >=0.22.0
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec>=0.6.0

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
@@ -204,6 +204,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/utf8.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp

diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
@@ -20,13 +20,18 @@ build:
     - PARALLEL_LEVEL
     - VERSION_SUFFIX
     - PROJECT_FLASH
+  # libcudf's run_exports pinning is looser than we would like
+  ignore_run_exports:
+    - libcudf
 
 requirements:
   build:
     - cmake >=3.20.1
   host:
     - libcudf {{version}}
     - librdkafka >=1.7.0,<1.8.0a0
+  run:
+    - libcudf {{version}}
 
 test:
   commands:

diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
@@ -114,7 +114,8 @@ std::pair<int64_t, int64_t> default_range()
 template <typename T, std::enable_if_t<cudf::is_numeric<T>()>* = nullptr>
 std::pair<T, T> default_range()
 {
-  return {std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max()};
+  // Limits need to be such that `upper - lower` does not overflow
+  return {std::numeric_limits<T>::lowest() / 2, std::numeric_limits<T>::max() / 2};
 }
 }  // namespace
 

diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,6 +63,19 @@ std::unique_ptr<table> sort_by_key(
   rmm::cuda_stream_view stream                   = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::stable_sort_by_key
+ *
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<table> stable_sort_by_key(
+  table_view const& values,
+  table_view const& keys,
+  std::vector<order> const& column_order         = {},
+  std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+
 /**
  * @copydoc cudf::segmented_sorted_order
  *

diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -145,6 +145,36 @@ std::unique_ptr<table> sort_by_key(
   std::vector<null_order> const& null_precedence = {},
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Performs a key-value stable sort.
+ *
+ * Creates a new table that reorders the rows of `values` according to the
+ * lexicographic ordering of the rows of `keys`.
+ *
+ * The order of equivalent elements is guaranteed to be preserved.
+ *
+ * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`.
+ *
+ * @param values The table to reorder
+ * @param keys The table that determines the ordering
+ * @param column_order The desired order for each column in `keys`. Size must be
+ * equal to `keys.num_columns()` or empty. If empty, all columns are sorted in
+ * ascending order.
+ * @param null_precedence The desired order of a null element compared to other
+ * elements for each column in `keys`. Size must be equal to
+ * `keys.num_columns()` or empty. If empty, all columns will be sorted with
+ * `null_order::BEFORE`.
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ * @return The reordering of `values` determined by the lexicographic order of
+ * the rows of `keys`.
+ */
+std::unique_ptr<table> stable_sort_by_key(
+  table_view const& values,
+  table_view const& keys,
+  std::vector<order> const& column_order         = {},
+  std::vector<null_order> const& null_precedence = {},
+  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Computes the ranks of input column in sorted order.
  *

diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+/**
+ * @file
+ * @brief Standalone string functions.
+ */
+
+namespace cudf {
+
+using char_utf8 = uint32_t;  ///< UTF-8 characters are 1-4 bytes
+
+namespace strings {
+namespace detail {
+
+/**
+ * @brief This will return true if passed the first byte of a UTF-8 character.
+ *
+ * @param byte Any byte from a valid UTF-8 character
+ * @return true if this the first byte of the character
+ */
+constexpr bool is_begin_utf8_char(uint8_t byte)
+{
+  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
+  return (byte & 0xC0) != 0x80;
+}
+
+/**
+ * @brief Returns the number of bytes in the specified character.
+ *
+ * @param character Single character
+ * @return Number of bytes
+ */
+constexpr size_type bytes_in_char_utf8(char_utf8 character)
+{
+  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
+         static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
+         static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
+}
+
+/**
+ * @brief Returns the number of bytes used to represent the provided byte.
+ *
+ * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
+ * single character. For example, for the two-byte 0xC3A8 single character,
+ * the first byte would return 2 and the second byte would return 0.
+ *
+ * @param byte Byte from an encoded character.
+ * @return Number of bytes.
+ */
+constexpr size_type bytes_in_utf8_byte(uint8_t byte)
+{
+  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
+         + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
+         + static_cast<size_type>((byte & 0xC0) == 0xC0)    // 2-byte character prefix
+         - static_cast<size_type>((byte & 0xC0) == 0x80);   // intermediate byte
+}
+
+/**
+ * @brief Convert a char array into a char_utf8 value.
+ *
+ * @param str String containing encoded char bytes.
+ * @param[out] character Single char_utf8 value.
+ * @return The number of bytes in the character
+ */
+constexpr size_type to_char_utf8(const char* str, char_utf8& character)
+{
+  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
+
+  character = static_cast<char_utf8>(*str++) & 0xFF;
+  if (chr_width > 1) {
+    character = character << 8;
+    character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 8;
+    if (chr_width > 2) {
+      character = character << 8;
+      character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 16;
+      if (chr_width > 3) {
+        character = character << 8;
+        character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 24;
+      }
+    }
+  }
+  return chr_width;
+}
+
+/**
+ * @brief Place a char_utf8 value into a char array.
+ *
+ * @param character Single character
+ * @param[out] str Output array.
+ * @return The number of bytes in the character
+ */
+constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
+{
+  size_type const chr_width = bytes_in_char_utf8(character);
+  for (size_type idx = 0; idx < chr_width; ++idx) {
+    str[chr_width - idx - 1] = static_cast<char>(character) & 0xFF;
+    character                = character >> 8;
+  }
+  return chr_width;
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.hpp>
 
 #ifndef __CUDA_ARCH__

diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -333,96 +333,4 @@ class string_view {
   __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const;
 };
 
-namespace strings {
-namespace detail {
-
-/**
- * @brief This will return true if passed the first byte of a UTF-8 character.
- *
- * @param byte Any byte from a valid UTF-8 character
- * @return true if this the first byte of the character
- */
-constexpr bool is_begin_utf8_char(uint8_t byte)
-{
-  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
-  return (byte & 0xC0) != 0x80;
-}
-
-/**
- * @brief Returns the number of bytes in the specified character.
- *
- * @param character Single character
- * @return Number of bytes
- */
-constexpr size_type bytes_in_char_utf8(char_utf8 character)
-{
-  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
-         static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
-         static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
-}
-
-/**
- * @brief Returns the number of bytes used to represent the provided byte.
- *
- * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
- * single character. For example, for the two-byte 0xC3A8 single character,
- * the first byte would return 2 and the second byte would return 0.
- *
- * @param byte Byte from an encoded character.
- * @return Number of bytes.
- */
-constexpr size_type bytes_in_utf8_byte(uint8_t byte)
-{
-  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
-         + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
-         + static_cast<size_type>((byte & 0xC0) == 0xC0)    // 2-byte character prefix
-         - static_cast<size_type>((byte & 0xC0) == 0x80);   // intermediate byte
-}
-
-/**
- * @brief Convert a char array into a char_utf8 value.
- *
- * @param str String containing encoded char bytes.
- * @param[out] character Single char_utf8 value.
- * @return The number of bytes in the character
- */
-CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character)
-{
-  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
-
-  character = static_cast<char_utf8>(*str++) & 0xFF;
-  if (chr_width > 1) {
-    character = character << 8;
-    character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 8;
-    if (chr_width > 2) {
-      character = character << 8;
-      character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 16;
-      if (chr_width > 3) {
-        character = character << 8;
-        character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 24;
-      }
-    }
-  }
-  return chr_width;
-}
-
-/**
- * @brief Place a char_utf8 value into a char array.
- *
- * @param character Single character
- * @param[out] str Allocated char array with enough space to hold the encoded character.
- * @return The number of bytes in the character
- */
-CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str)
-{
-  size_type const chr_width = bytes_in_char_utf8(character);
-  for (size_type idx = 0; idx < chr_width; ++idx) {
-    str[chr_width - idx - 1] = static_cast<char>(character) & 0xFF;
-    character                = character >> 8;
-  }
-  return chr_width;
-}
-
-}  // namespace detail
-}  // namespace strings
 }  // namespace cudf