Skip to content

Commit

Permalink
Merge branch 'branch-0.19' into benchmarks-strings-translate
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Mar 18, 2021
2 parents 65fdcb9 + 951b455 commit 0a1db67
Show file tree
Hide file tree
Showing 34 changed files with 928 additions and 425 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH
string/convert_floats_benchmark.cpp
string/copy_benchmark.cpp
string/extract_benchmark.cpp
string/factory_benchmark.cu
string/filter_benchmark.cpp
string/find_benchmark.cpp
string/replace_benchmark.cpp
Expand Down
93 changes: 93 additions & 0 deletions cpp/benchmarks/string/factory_benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/device_uvector.hpp>

#include <thrust/execution_policy.h>
#include <thrust/transform.h>

#include <limits>

namespace {
/// Pointer-and-byte-count description of a single string.
using string_pair = thrust::pair<char const*, cudf::size_type>;

/**
 * @brief Device functor converting a (string_view, validity) pair into a `string_pair`.
 *
 * When the validity flag is true the result holds the string's data pointer and
 * its size in bytes; otherwise the result is `{nullptr, 0}`.
 */
struct string_view_to_pair {
  __device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
  {
    // Entries flagged as not valid are represented by a null pointer and zero length.
    if (!p.second) { return string_pair{nullptr, 0}; }
    return string_pair{p.first.data(), p.first.size_bytes()};
  }
};
}  // namespace

/// Fixture for the strings factory benchmark; adds nothing beyond `cudf::benchmark`.
class StringsFactory : public cudf::benchmark {};

/**
 * @brief Benchmarks `cudf::make_strings_column` when given a vector of `string_pair`s.
 *
 * A random single-column strings table is generated, every row is converted to a
 * `string_pair` on the device, and only the `make_strings_column` call is wrapped
 * by the event timer.
 *
 * @param state Benchmark state; `range(0)` is the row count and `range(1)` is the
 *              maximum string length handed to the data profile.
 */
static void BM_factory(benchmark::State& state)
{
  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));
  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));

  // Build the random source strings.
  data_profile profile;
  profile.set_distribution_params(
    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
  auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, profile);

  // Untimed setup: turn each row into a (pointer, size) pair.
  auto d_column = cudf::column_device_view::create(table->view().column(0));
  rmm::device_vector<string_pair> pairs(d_column->size());
  auto const pairs_first = d_column->pair_begin<cudf::string_view, true>();
  auto const pairs_last  = d_column->pair_end<cudf::string_view, true>();
  thrust::transform(thrust::device, pairs_first, pairs_last, pairs.data(), string_view_to_pair{});

  // Timed section: the timer object lives for exactly one factory call per iteration.
  for (auto _ : state) {
    cuda_event_timer timer(state, true, 0);
    cudf::make_strings_column(pairs);
  }

  // Report throughput using the chars size of the source strings column.
  cudf::strings_column_view const input(table->view().column(0));
  auto const chars_bytes = input.chars_size();
  state.SetBytesProcessed(state.iterations() * chars_bytes);
}

/**
 * @brief Supplies the (row count, row length) argument ranges for the factory benchmark.
 *
 * The bounds and multipliers below are forwarded unchanged to
 * `generate_string_bench_args`.
 *
 * @param b Benchmark instance that receives the arguments.
 */
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row-count range and its multiplier.
  constexpr int min_rows = 1 << 12;
  constexpr int max_rows = 1 << 24;
  constexpr int row_mult = 8;

  // Row-length range and its multiplier.
  constexpr int min_rowlen = 1 << 5;
  constexpr int max_rowlen = 1 << 13;
  constexpr int len_mult   = 4;

  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}

// Defines a `StringsFactory` fixture benchmark called `name` whose body calls
// BM_factory, then registers it with the arguments from generate_bench_args,
// manual timing, and millisecond reporting units.
#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringsFactory, name) \
(::benchmark::State & st) { BM_factory(st); } \
BENCHMARK_REGISTER_F(StringsFactory, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Instantiate and register the single `factory` benchmark.
STRINGS_BENCHMARK_DEFINE(factory)
78 changes: 1 addition & 77 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -146,82 +146,6 @@ std::unique_ptr<column> filter_characters_of_type(
string_character_types types_to_keep = string_character_types::ALL_TYPES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to integers.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9].
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7']
* b = s.is_integer(s)
* b is [true, true, false, false, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns `true` if all strings contain
* characters that are valid for conversion to integers.
*
* This function will return `true` if all string elements
* has at least one character in [-+0-9].
*
* Any null entry or empty string will cause this function to return `false`.
*
* @param strings Strings instance for this operation.
* @return true if all string are valid
*/
bool all_integer(strings_column_view const& strings);

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to floats.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9eE.].
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5']
* b = s.is_float(s)
* b is [true, true, false, false, true, true, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_float(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns `true` if all strings contain
* characters that are valid for conversion to floats.
*
* This function will return `true` if all string elements
* has at least one character in [-+0-9eE.].
*
* Any null entry or empty string will cause this function to return `false`.
*
* @param strings Strings instance for this operation.
* @return true if all string are valid
*/
bool all_float(strings_column_view const& strings);

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
26 changes: 25 additions & 1 deletion cpp/include/cudf/strings/convert/convert_floats.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -68,6 +68,30 @@ std::unique_ptr<column> from_floats(
column_view const& floats,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to floats.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9eE.].
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7', '8.9', '3.7e+5']
* b = s.is_float(s)
* b is [true, true, false, false, true, true, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_float(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
26 changes: 25 additions & 1 deletion cpp/include/cudf/strings/convert/convert_integers.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -73,6 +73,30 @@ std::unique_ptr<column> from_integers(
column_view const& integers,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to integers.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9].
*
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'A', '+7']
* b = s.is_integer(s)
* b is [true, true, false, false, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a new integer numeric column parsing hexadecimal values from the
* provided strings column.
Expand Down
87 changes: 59 additions & 28 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,60 @@
#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

template <typename Iterator>
constexpr inline bool is_signed_iterator()
/**
* @brief Returns a new chars column using the specified indices to select
* strings from the input iterator.
*
* This uses a character-parallel gather CUDA kernel that performs very
* well on a strings column with long strings (e.g. average > 64 bytes).
*
* @tparam StringIterator Iterator should produce `string_view` objects.
* @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
*
* @param strings_begin Start of the iterator to retrieve `string_view` instances
* @param map_begin Start of index iterator.
* @param map_end End of index iterator.
* @param offsets The offset values to be associated with the output chars column.
* @param chars_bytes The total number of bytes for the output chars column.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New chars column fit for a strings column.
*/
template <typename StringIterator, typename MapIterator>
std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
MapIterator map_begin,
MapIterator map_end,
cudf::device_span<int32_t const> const offsets,
size_type chars_bytes,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return std::is_signed<typename std::iterator_traits<Iterator>::value_type>::value;
}
auto const output_count = std::distance(map_begin, map_end);
if (output_count == 0) return make_empty_column(data_type{type_id::INT8});

namespace strings {
namespace detail {
auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr);
auto const d_chars = chars_column->mutable_view().template data<char>();

auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char {
auto const out_row =
thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx));
auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index
auto const d_str = strings_begin[row_idx]; // get row's string
auto const offset = out_idx - *out_row; // get string's char
return d_str.data()[offset];
};

thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(chars_bytes),
d_chars,
gather_chars_fn);

return chars_column;
}

/**
* @brief Returns a new strings column using the specified indices to select
Expand Down Expand Up @@ -107,29 +152,15 @@ std::unique_ptr<cudf::column> gather(
rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets);

// build chars column
size_type const out_chars_bytes = static_cast<size_type>(total_bytes);
auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr);
auto const d_out_chars = out_chars_column->mutable_view().template data<char>();

// fill in chars
cudf::device_span<int32_t const> const d_out_offsets_span(d_out_offsets, output_count + 1);
auto const d_in_chars = (strings_count > 0) ? strings.chars().data<char>() : nullptr;
auto gather_chars_fn =
[d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) {
// find output row index for this output char index
auto const next_row_ptr = thrust::upper_bound(
thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx);
auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1;
auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx];
auto const in_row_idx = begin[out_row_idx];
auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset;
return d_in_chars[in_char_offset];
};
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(out_chars_bytes),
d_out_chars,
gather_chars_fn);
auto const d_strings = column_device_view::create(strings.parent(), stream);
auto out_chars_column = gather_chars(d_strings->begin<string_view>(),
begin,
end,
d_out_offsets_span,
static_cast<size_type>(total_bytes),
stream,
mr);

return make_strings_column(output_count,
std::move(out_offsets_column),
Expand Down
Loading

0 comments on commit 0a1db67

Please sign in to comment.