Skip to content

Commit

Permalink
Move common string utilities to public api (rapidsai#16070)
Browse files Browse the repository at this point in the history
As part of rapidsai#15982, a subset of the strings utility functions has been identified as worth exposing as part of the cudf public API.

The `create_string_vector_from_column`, `get_offset64_threshold`, and `is_large_strings_enabled` functions are now part of the public `cudf::strings` API.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Lawrence Mitchell (https://github.com/wence-)

URL: rapidsai#16070
  • Loading branch information
robertmaynard authored Jun 28, 2024
1 parent a4b951a commit 78f4a8a
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 20 deletions.
7 changes: 4 additions & 3 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -81,11 +82,11 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
auto const total_bytes =
cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);

auto const threshold = get_offset64_threshold();
CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold),
auto const threshold = cudf::strings::get_offset64_threshold();
CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold),
"Size of output exceeds the column size limit",
std::overflow_error);
if (total_bytes >= get_offset64_threshold()) {
if (total_bytes >= cudf::strings::get_offset64_threshold()) {
// recompute as int64 offsets when above the threshold
offsets_column = make_numeric_column(
data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
Expand Down
62 changes: 62 additions & 0 deletions cpp/include/cudf/strings/utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

namespace CUDF_EXPORT cudf {
namespace strings {

/**
* @brief Creates a string_view vector from a strings column.
*
* @param strings Strings column instance.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned vector's device memory.
* @return Device vector of string_views
*/
rmm::device_uvector<string_view> create_string_vector_from_column(
cudf::strings_column_view const strings,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Return the threshold size for a strings column to use int64 offsets
*
* A computed size at or above this threshold should use int64 offsets, otherwise
* int32 offsets. By default this function will return std::numeric_limits<int32_t>::max().
* This value can be overridden at runtime using the environment variable
* LIBCUDF_LARGE_STRINGS_THRESHOLD.
*
* @return size in bytes
*/
int64_t get_offset64_threshold();

/**
* @brief Checks if large strings support is enabled
*
* This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
*
* @return true if large strings are supported
*/
bool is_large_strings_enabled();

} // namespace strings
} // namespace CUDF_EXPORT cudf
22 changes: 17 additions & 5 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "strings/char_types/char_cases.h"
#include "strings/char_types/char_flags.h"

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand All @@ -36,8 +37,7 @@
#include <cstdlib>
#include <string>

namespace cudf {
namespace strings {
namespace cudf::strings {
namespace detail {

/**
Expand Down Expand Up @@ -175,5 +175,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
}

} // namespace detail
} // namespace strings
} // namespace cudf

// Public cudf::strings entry point: opens the function range via CUDF_FUNC_RANGE
// and then hands all arguments, unchanged, to the detail implementation.
rmm::device_uvector<string_view> create_string_vector_from_column(
  cudf::strings_column_view const strings,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto string_views = detail::create_string_vector_from_column(strings, stream, mr);
  return string_views;
}

// Public cudf::strings wrapper: returns whatever the detail implementation reports.
int64_t get_offset64_threshold()
{
  return detail::get_offset64_threshold();
}
// Public cudf::strings wrapper: returns whatever the detail implementation reports.
bool is_large_strings_enabled()
{
  return detail::is_large_strings_enabled();
}

} // namespace cudf::strings
4 changes: 2 additions & 2 deletions cpp/tests/column/factories_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include <cudf/null_mask.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
Expand Down Expand Up @@ -762,7 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); }

TEST_F(ColumnFactoryTest, FromScalarErrors)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }
if (cudf::strings::is_large_strings_enabled()) { return; }
cudf::string_scalar ss("hello world");
EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error);

Expand Down
8 changes: 4 additions & 4 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/dictionary/encode.hpp>
#include <cudf/filling.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -189,7 +189,7 @@ TEST_F(StringColumnTest, ConcatenateManyColumns)

TEST_F(StringColumnTest, ConcatenateTooLarge)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }
if (cudf::strings::is_large_strings_enabled()) { return; }

std::string big_str(1000000, 'a'); // 1 million bytes x 5 = 5 million bytes
cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
Expand Down Expand Up @@ -379,7 +379,7 @@ TEST_F(OverflowTest, OverflowTest)
}

// string column, overflow on chars
if (!cudf::strings::detail::is_large_strings_enabled()) {
if (!cudf::strings::is_large_strings_enabled()) {
constexpr auto size = static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 string columns of with 1 billion chars in each
Expand Down Expand Up @@ -502,7 +502,7 @@ TEST_F(OverflowTest, Presliced)
}

// strings, overflow on chars
if (!cudf::strings::detail::is_large_strings_enabled()) {
if (!cudf::strings::is_large_strings_enabled()) {
constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
constexpr cudf::size_type string_size = 64;
constexpr cudf::size_type num_rows = total_chars_size / string_size;
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/strings/array_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
#include <cudf/copying.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/sorting.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/utilities.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>

Expand Down Expand Up @@ -153,7 +153,7 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)

TEST_F(StringsColumnTest, GatherTooBig)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }
if (cudf::strings::is_large_strings_enabled()) { return; }

std::vector<int8_t> h_chars(3000000);
cudf::test::fixed_width_column_wrapper<int8_t> chars(h_chars.begin(), h_chars.end());
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/strings/repeat_strings_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include <cudf_test/type_lists.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/utilities.hpp>

using namespace cudf::test::iterators;

Expand Down Expand Up @@ -221,7 +221,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)

TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
{
if (cudf::strings::detail::is_large_strings_enabled()) { return; }
if (cudf::strings::is_large_strings_enabled()) { return; }

auto const strs = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
auto const strs_cv = cudf::strings_column_view(strs);
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/udf/udf_apis.hpp>
#include <cudf/strings/udf/udf_string.cuh>
#include <cudf/strings/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/device_uvector.hpp>
Expand Down Expand Up @@ -57,7 +57,7 @@ std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const
rmm::cuda_stream_view stream)
{
return std::make_unique<rmm::device_buffer>(
std::move(cudf::strings::detail::create_string_vector_from_column(
std::move(cudf::strings::create_string_vector_from_column(
cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource())
.release()));
}
Expand Down

0 comments on commit 78f4a8a

Please sign in to comment.