Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial support for string udfs in libcudf #10686

Closed
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ add_library(
src/strings/strip.cu
src/strings/substring.cu
src/strings/translate.cu
src/strings/udf/column_functions.cu
src/strings/utilities.cu
src/strings/wrap.cu
src/structs/copying/concatenate.cu
Expand Down
66 changes: 57 additions & 9 deletions cpp/include/cudf/strings/string.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,21 @@

#include <cudf/strings/string_view.cuh>

#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/logical.h>

namespace cudf {
namespace strings {
namespace string {
namespace detail {

/**
 * @brief Count the bytes in a null-terminated character array.
 *
 * The terminating null byte is not included in the count.
 *
 * @param str Pointer to a null-terminated character array; may be nullptr
 * @return Number of bytes preceding the terminator, or 0 if `str` is nullptr
 */
__device__ inline static cudf::size_type bytes_in_null_terminated_string(char const* str)
{
  if (str == nullptr) { return 0; }
  cudf::size_type length = 0;
  // Walk forward until the terminator is reached, counting each byte passed.
  for (auto itr = str; *itr != 0; ++itr) {
    ++length;
  }
  return length;
}

} // namespace detail

/**
* @addtogroup strings_classes
* @{
Expand All @@ -50,9 +58,12 @@ inline __device__ bool is_integer(string_view const& d_str)
auto begin = d_str.begin();
auto end = d_str.end();
if (*begin == '+' || *begin == '-') ++begin;
return (thrust::distance(begin, end) > 0) &&
thrust::all_of(
thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; });
auto const result = begin < end;
while (begin < end) {
if (*begin < '0' || *begin > '9') { return false; }
++begin;
}
return result;
}

/**
Expand Down Expand Up @@ -149,7 +160,44 @@ inline __device__ bool is_float(string_view const& d_str)
return result;
}

/**
 * @brief Check whether a string begins with the given byte sequence.
 *
 * @param dstr String to check
 * @param tgt Pointer to the bytes expected at the start of `dstr`
 * @param bytes Number of bytes in `tgt`
 * @return true if the first `bytes` bytes of `dstr` compare equal to `tgt`
 */
__device__ inline bool starts_with(cudf::string_view const dstr,
                                   char const* tgt,
                                   cudf::size_type bytes)
{
  // A target longer than the string can never be a prefix; the short-circuit
  // keeps the sub-view from being built in that case.
  return (bytes <= dstr.size_bytes()) &&
         (cudf::string_view{dstr.data(), bytes}.compare(tgt, bytes) == 0);
}

/**
 * @brief Check whether a string begins with the given null-terminated target.
 *
 * @param dstr String to check
 * @param tgt Null-terminated character array expected at the start of `dstr`
 * @return true if `dstr` begins with `tgt`
 */
__device__ inline bool starts_with(cudf::string_view const dstr, char const* tgt)
{
  // Measure the target once, then defer to the explicit-length overload.
  auto const tgt_bytes = detail::bytes_in_null_terminated_string(tgt);
  return starts_with(dstr, tgt, tgt_bytes);
}

/**
 * @brief Check whether a string begins with another string.
 *
 * @param dstr String to check
 * @param tgt String expected at the start of `dstr`
 * @return true if `dstr` begins with `tgt`
 */
__device__ inline bool starts_with(cudf::string_view const dstr, cudf::string_view const& tgt)
{
  // Unpack the target view and defer to the pointer/length overload.
  char const* const tgt_data = tgt.data();
  auto const tgt_bytes       = tgt.size_bytes();
  return starts_with(dstr, tgt_data, tgt_bytes);
}

/**
 * @brief Check whether a string ends with the given byte sequence.
 *
 * @param dstr String to check
 * @param tgt Pointer to the bytes expected at the end of `dstr`
 * @param bytes Number of bytes in `tgt`
 * @return true if the last `bytes` bytes of `dstr` compare equal to `tgt`
 */
__device__ inline bool ends_with(cudf::string_view const dstr,
                                 char const* tgt,
                                 cudf::size_type bytes)
{
  auto const total = dstr.size_bytes();
  // A target longer than the string can never be a suffix.
  if (total < bytes) { return false; }
  // View over the trailing `bytes` bytes of the input.
  cudf::string_view const tail{dstr.data() + (total - bytes), bytes};
  return tail.compare(tgt, bytes) == 0;
}

/**
 * @brief Check whether a string ends with the given null-terminated target.
 *
 * @param dstr String to check
 * @param tgt Null-terminated character array expected at the end of `dstr`
 * @return true if `dstr` ends with `tgt`
 */
__device__ inline bool ends_with(cudf::string_view const dstr, char const* tgt)
{
  // Measure the target once, then defer to the explicit-length overload.
  auto const tgt_bytes = detail::bytes_in_null_terminated_string(tgt);
  return ends_with(dstr, tgt, tgt_bytes);
}

/**
 * @brief Check whether a string ends with another string.
 *
 * @param dstr String to check
 * @param tgt String expected at the end of `dstr`
 * @return true if `dstr` ends with `tgt`
 */
__device__ inline bool ends_with(cudf::string_view const dstr, cudf::string_view const& tgt)
{
  // Delegate to the pointer/length ends_with overload. The previous code
  // called starts_with here, so it tested the prefix rather than the suffix.
  return ends_with(dstr, tgt.data(), tgt.size_bytes());
}

/** @} */ // end of group
} // namespace string
} // namespace strings
} // namespace cudf
14 changes: 1 addition & 13 deletions cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@
#include <cudf/utilities/error.hpp>
#endif

// CUDF_JIT_UDF is defined when this header is included in a https://github.com/NVIDIA/jitify
// or jitify2 source file. Jitify cannot include thrust headers at this time.
#ifndef CUDF_JIT_UDF
#include <thrust/count.h>
#include <thrust/execution_policy.h>
#endif

// This file should only include device code logic.
// Host-only or host/device code should be defined in the string_view.hpp header file.

Expand All @@ -47,18 +40,13 @@ namespace detail {
/**
 * @brief Count the UTF-8 characters in a byte array.
 *
 * Each byte for which is_begin_utf8_char() is true is counted as the start of
 * one character. A plain loop is used so that the function needs no thrust
 * headers and compiles the same way with or without CUDF_JIT_UDF defined.
 *
 * @param str Pointer to the first byte; may be nullptr
 * @param bytes Number of bytes to examine starting at `str`
 * @return Number of characters found; 0 if `str` is nullptr or `bytes` is 0
 */
__device__ inline size_type characters_in_string(const char* str, size_type bytes)
{
  if ((str == nullptr) || (bytes == 0)) return 0;
  // Declared exactly once: the earlier layout declared `ptr` both before and
  // inside the preprocessor branch, which is a redeclaration in the JIT build.
  auto ptr        = reinterpret_cast<uint8_t const*>(str);
  size_type chars = 0;
  auto const end  = ptr + bytes;
  while (ptr < end) {
    chars += is_begin_utf8_char(*ptr++);
  }
  return chars;
}

/**
Expand Down
88 changes: 88 additions & 0 deletions cpp/include/cudf/strings/udf/column_functions.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/udf/dstring.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

namespace cudf {
namespace strings {
//! Strings UDF support
namespace udf {

/**
* @addtogroup strings_udfs
* @{
* @file
* @brief Strings APIs for supporting user-defined functions
*/

/**
* @brief Return a vector of cudf::string_view for the given strings column
*
* NOTE(review): the returned views presumably reference the character data of
* `input` rather than owning copies, in which case the column must outlive the
* returned vector -- confirm against the implementation.
*
* @param input Strings column
* @param mr Device memory resource used to allocate the returned vector;
*           defaults to the current device resource
* @return Device vector of cudf::string_view objects
*/
rmm::device_uvector<string_view> create_string_view_array(
cudf::strings_column_view const input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Return an empty dstring array
*
* Once finished with the array, call free_dstring_array to deallocate the dstring
* objects before destroying the returned memory buffer.
*
* @param size Number of empty dstring elements
* @param mr Device memory resource used to allocate the returned buffer;
*           defaults to the current device resource
* @return Device buffer containing the empty dstring objects
*/
std::unique_ptr<rmm::device_buffer> create_dstring_array(
size_type size, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Return a cudf::column given an array of dstring objects
*
* The input is taken as a span of const dstring, so the dstring objects
* themselves are not modified by this call.
*
* @param input dstring array
* @param mr Device memory resource used to allocate the returned column;
*           defaults to the current device resource
* @return A strings column copy of the dstring objects
*/
std::unique_ptr<cudf::column> make_strings_column(
device_span<dstring const> input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Free all the dstring objects in the given array
*
* Call this to free the internal memory within individual dstring objects.
* The input dstrings are modified (emptied) and can be reused.
* The array storage that holds the dstring objects is not released by this
* call (see create_dstring_array).
*
* @param input dstring array
*/
void free_dstring_array(device_span<dstring> input);

/** @} */ // end of group
} // namespace udf
} // namespace strings
} // namespace cudf
Loading