Skip to content

Commit

Permalink
Merge branch 'branch-22.04' of github.com:rapidsai/cudf into improvem…
Browse files Browse the repository at this point in the history
…ent/ListOfColumnsRefactor/copying
  • Loading branch information
isVoid committed Mar 8, 2022
2 parents a287693 + e9876cf commit 071b393
Show file tree
Hide file tree
Showing 38 changed files with 1,844 additions and 1,095 deletions.
5 changes: 3 additions & 2 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,10 @@ fi
# TEST - Run py.test, notebooks
################################################################################

cd "$WORKSPACE/python/cudf"
cd "$WORKSPACE/python/cudf/cudf"
# It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
gpuci_logger "Python py.test for cuDF"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests

cd "$WORKSPACE/python/dask_cudf"
gpuci_logger "Python py.test for dask-cudf"
Expand Down
4 changes: 4 additions & 0 deletions conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ build:
- CC
- CXX
- CUDAHOSTCXX
# libcudf's run_exports pinning is looser than we would like
ignore_run_exports:
- libcudf

requirements:
build:
Expand All @@ -44,6 +47,7 @@ requirements:
- numba >=0.54
- numpy
- {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda
- libcudf {{ version }}
- fastavro >=0.22.0
- {{ pin_compatible('rmm', max_pin='x.x') }}
- fsspec>=0.6.0
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ test:
- test -f $PREFIX/include/cudf/strings/detail/fill.hpp
- test -f $PREFIX/include/cudf/strings/detail/json.hpp
- test -f $PREFIX/include/cudf/strings/detail/replace.hpp
- test -f $PREFIX/include/cudf/strings/detail/utf8.hpp
- test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
- test -f $PREFIX/include/cudf/strings/extract.hpp
- test -f $PREFIX/include/cudf/strings/findall.hpp
Expand Down
5 changes: 5 additions & 0 deletions conda/recipes/libcudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,18 @@ build:
- PARALLEL_LEVEL
- VERSION_SUFFIX
- PROJECT_FLASH
# libcudf's run_exports pinning is looser than we would like
ignore_run_exports:
- libcudf

requirements:
build:
- cmake >=3.20.1
host:
- libcudf {{version}}
- librdkafka >=1.7.0,<1.8.0a0
run:
- libcudf {{version}}

test:
commands:
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ std::pair<int64_t, int64_t> default_range()
// Default value range for numeric types: half of the representable range on each side.
// Using the full [lowest, max] range would make `upper - lower` overflow in `T`
// (undefined behavior for signed integers), so both limits are halved.
template <typename T, std::enable_if_t<cudf::is_numeric<T>()>* = nullptr>
std::pair<T, T> default_range()
{
  // Limits need to be such that `upper - lower` does not overflow
  return {std::numeric_limits<T>::lowest() / 2, std::numeric_limits<T>::max() / 2};
}
} // namespace

Expand Down
15 changes: 14 additions & 1 deletion cpp/include/cudf/detail/sorting.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,6 +63,19 @@ std::unique_ptr<table> sort_by_key(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @copydoc cudf::stable_sort_by_key
 *
 * Variant of the public API that additionally takes the CUDA stream to run on;
 * when omitted, `rmm::cuda_stream_default` is used.
 *
 * @param[in] stream CUDA stream used for device memory operations and kernel launches.
 */
std::unique_ptr<table> stable_sort_by_key(
table_view const& values,
table_view const& keys,
std::vector<order> const& column_order = {},
std::vector<null_order> const& null_precedence = {},
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::segmented_sorted_order
*
Expand Down
32 changes: 31 additions & 1 deletion cpp/include/cudf/sorting.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -145,6 +145,36 @@ std::unique_ptr<table> sort_by_key(
std::vector<null_order> const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Performs a key-value stable sort.
 *
 * Creates a new table that reorders the rows of `values` according to the
 * lexicographic ordering of the rows of `keys`.
 *
 * The order of equivalent elements is guaranteed to be preserved: rows of
 * `values` whose corresponding rows of `keys` compare equal keep the relative
 * order they had in the input.
 *
 * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`.
 *
 * @param values The table to reorder
 * @param keys The table that determines the ordering
 * @param column_order The desired order for each column in `keys`. Size must be
 * equal to `keys.num_columns()` or empty. If empty, all columns are sorted in
 * ascending order.
 * @param null_precedence The desired order of a null element compared to other
 * elements for each column in `keys`. Size must be equal to
 * `keys.num_columns()` or empty. If empty, all columns will be sorted with
 * `null_order::BEFORE`.
 * @param mr Device memory resource used to allocate the returned table's device memory.
 * Defaults to the current device resource.
 * @return The reordering of `values` determined by the lexicographic order of
 * the rows of `keys`.
 */
std::unique_ptr<table> stable_sort_by_key(
table_view const& values,
table_view const& keys,
std::vector<order> const& column_order = {},
std::vector<null_order> const& null_precedence = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes the ranks of input column in sorted order.
*
Expand Down
121 changes: 121 additions & 0 deletions cpp/include/cudf/strings/detail/utf8.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/types.hpp>

/**
* @file
* @brief Standalone UTF-8 character utility functions.
*/

namespace cudf {

using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes

namespace strings {
namespace detail {

/**
 * @brief Tests whether a byte can be the first byte of a UTF-8 character.
 *
 * @param byte Any byte from a valid UTF-8 character
 * @return true if `byte` is not a continuation byte, i.e. it is the first byte
 *         of the character
 */
constexpr bool is_begin_utf8_char(uint8_t byte)
{
  // Continuation bytes carry the bit pattern 10xxxxxx; every other byte starts a character.
  constexpr uint8_t high_bits_mask    = 0xC0;
  constexpr uint8_t continuation_bits = 0x80;
  return (byte & high_bits_mask) != continuation_bits;
}

/**
 * @brief Counts the bytes occupied by a packed UTF-8 character value.
 *
 * The result is 1 plus the number of the three upper bytes of `character`
 * that are non-zero.
 *
 * @param character Single character
 * @return Number of bytes
 */
constexpr size_type bytes_in_char_utf8(char_utf8 character)
{
  // The lowest byte always counts; each non-zero higher byte adds one more.
  size_type width = 1;
  if ((character & unsigned{0x0000FF00}) > 0) { ++width; }
  if ((character & unsigned{0x00FF0000}) > 0) { ++width; }
  if ((character & unsigned{0xFF000000}) > 0) { ++width; }
  return width;
}

/**
 * @brief Returns the character width implied by a single encoded byte.
 *
 * The result is in the range 0 to 4. Intermediate (continuation) bytes within a
 * character yield 0. For example, for the two-byte 0xC3A8 single character,
 * the first byte yields 2 and the second byte yields 0.
 *
 * @param byte Byte from an encoded character.
 * @return Number of bytes.
 */
constexpr size_type bytes_in_utf8_byte(uint8_t byte)
{
  bool const has_4byte_prefix = (byte & 0xF0) == 0xF0;  // 1111xxxx
  bool const has_3byte_prefix = (byte & 0xE0) == 0xE0;  // 111xxxxx
  bool const has_2byte_prefix = (byte & 0xC0) == 0xC0;  // 11xxxxxx
  bool const is_intermediate  = (byte & 0xC0) == 0x80;  // 10xxxxxx

  // Longer prefixes also satisfy the shorter prefix tests, so the matches accumulate;
  // an intermediate byte cancels the base count of 1.
  return 1 + static_cast<size_type>(has_4byte_prefix) + static_cast<size_type>(has_3byte_prefix) +
         static_cast<size_type>(has_2byte_prefix) - static_cast<size_type>(is_intermediate);
}

/**
 * @brief Packs the bytes of one encoded character into a char_utf8 value.
 *
 * The first byte of the character ends up in the most significant occupied
 * byte of the result and the last byte in the least significant one.
 *
 * @param str String containing encoded char bytes.
 * @param[out] character Single char_utf8 value.
 * @return The number of bytes in the character
 */
constexpr size_type to_char_utf8(const char* str, char_utf8& character)
{
  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));

  // The first byte is always stored, whatever width was computed for it.
  character = static_cast<char_utf8>(str[0]) & 0xFF;
  // Each following byte of the character is appended below the bytes already read.
  for (size_type idx = 1; idx < chr_width; ++idx) {
    character = character << 8;
    character |= (static_cast<char_utf8>(str[idx]) & 0xFF);
  }
  return chr_width;
}

/**
 * @brief Writes the bytes of a char_utf8 value into a char array.
 *
 * The least significant byte of `character` is written last, so the bytes land
 * in `str[0]` through `str[width - 1]` in encoded order.
 *
 * @param character Single character
 * @param[out] str Output array; receives as many bytes as the returned count.
 * @return The number of bytes in the character
 */
constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
{
  size_type const chr_width = bytes_in_char_utf8(character);
  // Fill the output back to front, peeling off the lowest byte each time.
  size_type remaining = chr_width;
  while (remaining > 0) {
    --remaining;
    str[remaining] = static_cast<char>(character) & 0xFF;
    character      = character >> 8;
  }
  return chr_width;
}

} // namespace detail
} // namespace strings
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@

#pragma once

#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/string_view.hpp>

#ifndef __CUDA_ARCH__
Expand Down
94 changes: 1 addition & 93 deletions cpp/include/cudf/strings/string_view.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -333,96 +333,4 @@ class string_view {
__device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const;
};

namespace strings {
namespace detail {

/**
 * @brief Reports whether the given byte is the first byte of a UTF-8 character.
 *
 * @param byte Any byte from a valid UTF-8 character
 * @return true if this is the first byte of the character
 */
constexpr bool is_begin_utf8_char(uint8_t byte)
{
  // A continuation byte has its top two bits equal to binary 10.
  return (byte >> 6) != 0x2;
}

/**
 * @brief Returns how many bytes the specified character occupies.
 *
 * @param character Single character
 * @return Number of bytes: 1 plus one for each non-zero upper byte of `character`
 */
constexpr size_type bytes_in_char_utf8(char_utf8 character)
{
  bool const second_byte_set = (character & unsigned{0x0000FF00}) != 0;
  bool const third_byte_set  = (character & unsigned{0x00FF0000}) != 0;
  bool const fourth_byte_set = (character & unsigned{0xFF000000}) != 0;
  return 1 + static_cast<size_type>(second_byte_set) + static_cast<size_type>(third_byte_set) +
         static_cast<size_type>(fourth_byte_set);
}

/**
 * @brief Returns the number of bytes used to represent the provided byte.
 *
 * The value is between 0 and 4. An intermediate byte inside a character gives 0.
 * For example, for the two-byte 0xC3A8 single character, the first byte gives 2
 * and the second byte gives 0.
 *
 * @param byte Byte from an encoded character.
 * @return Number of bytes.
 */
constexpr size_type bytes_in_utf8_byte(uint8_t byte)
{
  if ((byte & 0xC0) == 0x80) { return 0; }  // intermediate byte
  if ((byte & 0xF0) == 0xF0) { return 4; }  // 4-byte character prefix
  if ((byte & 0xE0) == 0xE0) { return 3; }  // 3-byte character prefix
  if ((byte & 0xC0) == 0xC0) { return 2; }  // 2-byte character prefix
  return 1;                                 // single-byte character
}

/**
 * @brief Reads one encoded character from a char array into a char_utf8 value.
 *
 * Bytes are packed in reading order, so the first byte of the character sits
 * in the most significant occupied byte of the result.
 *
 * @param str String containing encoded char bytes.
 * @param[out] character Single char_utf8 value.
 * @return The number of bytes in the character
 */
CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character)
{
  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));

  character = static_cast<char_utf8>(str[0]) & 0xFF;
  if (chr_width < 2) { return chr_width; }

  // Second byte of the character.
  character = character << 8;
  character |= (static_cast<char_utf8>(str[1]) & 0xFF);
  if (chr_width < 3) { return chr_width; }

  // Third byte of the character.
  character = character << 8;
  character |= (static_cast<char_utf8>(str[2]) & 0xFF);
  if (chr_width < 4) { return chr_width; }

  // Fourth byte of the character.
  character = character << 8;
  character |= (static_cast<char_utf8>(str[3]) & 0xFF);
  return chr_width;
}

/**
 * @brief Stores a char_utf8 value as encoded bytes in a char array.
 *
 * @param character Single character
 * @param[out] str Allocated char array with enough space to hold the encoded character.
 * @return The number of bytes in the character
 */
CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str)
{
  size_type const chr_width = bytes_in_char_utf8(character);
  // Walk the output from its last byte to its first, emitting the low byte each step.
  char* out = str + chr_width;
  while (out != str) {
    --out;
    *out      = static_cast<char>(character) & 0xFF;
    character = character >> 8;
  }
  return chr_width;
}

} // namespace detail
} // namespace strings
} // namespace cudf
Loading

0 comments on commit 071b393

Please sign in to comment.