Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve url_decode performance for long strings #7353

Merged
merged 10 commits into from
Feb 15, 2021
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}")
# - strings benchmark -------------------------------------------------------------------

set(STRINGS_BENCH_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp")
"${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/string/url_decode_benchmark.cpp")

ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}")
103 changes: 103 additions & 0 deletions cpp/benchmarks/string/url_decode_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>

#include <cudf/strings/convert/convert_urls.hpp>
#include <cudf/types.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/cudf_gtest.hpp>

#include <algorithm>
#include <random>

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
#include "cudf/column/column_view.hpp"
#include "cudf/wrappers/durations.hpp"
jlowe marked this conversation as resolved.
Show resolved Hide resolved

/**
 * @brief Functor producing synthetic URL-encoded strings of a fixed length.
 *
 * Every generated string is exactly `num_chars` bytes long and is built from
 * plain 'a' characters and "%20" escape sequences. At each step an escape
 * sequence is emitted with probability `esc_seq_chance`, but only while more
 * than three characters remain to be written; otherwise a single 'a' is
 * appended. As a consequence a non-empty result always ends in 'a'.
 */
struct url_string_generator {
  // Exact length, in bytes, of every generated string.
  size_t num_chars;
  // Decides, per step, whether an escape sequence is emitted.
  std::bernoulli_distribution dist;

  /**
   * @param num_chars      Length of each generated string.
   * @param esc_seq_chance Probability in [0, 1] of emitting "%20" at a step.
   */
  url_string_generator(size_t num_chars, double esc_seq_chance)
    : num_chars{num_chars}, dist{esc_seq_chance}
  {
  }

  /**
   * @brief Generates one string, drawing randomness from `engine`.
   *
   * @param engine Random engine advanced once per candidate escape position.
   * @return A string of exactly `num_chars` characters.
   */
  std::string operator()(std::mt19937& engine)
  {
    std::string str;
    str.reserve(num_chars);
    while (str.size() < num_chars) {
      // The bound is expressed as an addition instead of `num_chars - 3`:
      // both operands are unsigned, so the subtraction would wrap around for
      // num_chars < 3 and allow "%20" to overshoot the requested length.
      if (str.size() + 3 < num_chars && dist(engine)) {
        str += "%20";
      } else {
        str.push_back('a');
      }
    }
    return str;
  }
};

/**
 * @brief Builds a strings column of synthetic URL-encoded rows.
 *
 * The random engine is seeded with the constant 1, so repeated calls with the
 * same arguments produce the same data.
 *
 * @param num_rows       Number of strings to generate.
 * @param chars_per_row  Length of every generated string.
 * @param esc_seq_chance Probability passed to url_string_generator for
 *                       emitting an escape sequence.
 * @return A strings column wrapper holding the generated rows.
 */
cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows,
                                                   cudf::size_type chars_per_row,
                                                   double esc_seq_chance)
{
  std::mt19937 rng(1);
  url_string_generator make_url(chars_per_row, esc_seq_chance);

  std::vector<std::string> rows;
  rows.reserve(num_rows);
  for (cudf::size_type row = 0; row < num_rows; ++row) {
    rows.push_back(make_url(rng));
  }

  return cudf::test::strings_column_wrapper(rows.begin(), rows.end());
}

// Benchmark fixture for the url_decode benchmarks. It adds no members of its
// own; the template parameter is the escape-sequence percentage that
// URLD_BENCHMARK_DEFINE forwards to BM_url_decode.
template <int esc_seq_pct>
class UrlDecode : public cudf::benchmark {
};

/**
 * @brief Benchmark body measuring cudf::strings::url_decode.
 *
 * The input column is generated once, outside the timed loop, from the two
 * benchmark range arguments: range(0) is the number of rows and range(1) the
 * number of characters per row.
 *
 * @tparam esc_seq_pct Percentage (0-100) converted to the escape-sequence
 *                     probability used when generating the input.
 * @param state Google Benchmark state supplying the ranges and iteration loop.
 */
template <int esc_seq_pct>
void BM_url_decode(benchmark::State& state)
{
  auto const row_count   = static_cast<cudf::size_type>(state.range(0));
  auto const row_length  = static_cast<cudf::size_type>(state.range(1));
  double const esc_ratio = esc_seq_pct / 100.0;

  auto input      = generate_column(row_count, row_length, esc_ratio);
  auto input_view = cudf::strings_column_view(input);

  for (auto _ : state) {
    // NOTE(review): the arguments presumably request a cache flush and select
    // the default stream -- confirm against cuda_event_timer's declaration.
    cuda_event_timer timer(state, true, 0);
    auto decoded = cudf::strings::url_decode(input_view);
  }

  // Per row: the character bytes plus sizeof(cudf::size_type), presumably
  // accounting for the row's offset entry.
  auto const bytes_per_row = row_length + sizeof(cudf::size_type);
  state.SetBytesProcessed(state.iterations() * row_count * bytes_per_row);
}

// Defines and registers one url_decode benchmark on the UrlDecode fixture.
//   name        - identifier of the benchmark case
//   esc_seq_pct - escape-sequence percentage forwarded to BM_url_decode
// Each case is run over two argument ranges, both stepping by a factor of 10:
// 100..100000 (read by BM_url_decode as the row count) and 10..10000 (read as
// the characters per row). Results are reported in milliseconds using manual
// timing, presumably the time recorded by cuda_event_timer inside BM_url_decode.
#define URLD_BENCHMARK_DEFINE(name, esc_seq_pct) \
BENCHMARK_TEMPLATE_DEFINE_F(UrlDecode, name, esc_seq_pct) \
(::benchmark::State & state) { BM_url_decode<esc_seq_pct>(state); } \
BENCHMARK_REGISTER_F(UrlDecode, name) \
->RangeMultiplier(10) \
->Ranges({{100, 100000}, {10, 10000}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

// Benchmark cases with 10% and 20% escape-sequence chance.
URLD_BENCHMARK_DEFINE(url_decode_10pct, 10)
URLD_BENCHMARK_DEFINE(url_decode_20pct, 20)
197 changes: 143 additions & 54 deletions cpp/src/strings/convert/convert_urls.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/convert/convert_urls.hpp>
Expand All @@ -25,6 +26,11 @@
#include <strings/utilities.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/binary_search.h>
#include <thrust/count.h>
#include <thrust/iterator/counting_iterator.h>

namespace cudf {
namespace strings {
Expand Down Expand Up @@ -164,50 +170,98 @@ std::unique_ptr<column> url_encode(strings_column_view const& strings,

namespace detail {
namespace {
//
// This is the functor for the url_decode() method below.
// Specific requirements are documented in custrings issue #321.
// In summary it converts all character sequences starting with '%' into bytes
// interpreting the following 2 characters as hex values to create the output byte.
// For example, the sequence '%20' is converted into byte (0x20) which is a single
// space character. Another example converts '%C3%A9' into 2 sequential bytes
// (0xc3 and 0xa9 respectively). Overall, 3 characters are converted into one byte
// whenever a '%' character is encountered in the string.
//
struct url_decoder_fn {
column_device_view const d_strings;
int32_t const* d_offsets{};
char* d_chars{};

// utility to convert a hex char into a single byte
__device__ uint8_t hex_char_to_byte(char ch)
// Utility to convert a single hexadecimal digit character into its numeric
// value: '0'-'9' map to 0-9, and 'A'-'F' / 'a'-'f' map to 10-15.
// Any other character yields 0, so callers that need to distinguish an
// invalid digit from '0' must validate the character separately.
__device__ uint8_t hex_char_to_byte(char ch)
jlowe marked this conversation as resolved.
Show resolved Hide resolved
{
if (ch >= '0' && ch <= '9') return (ch - '0');
if (ch >= 'A' && ch <= 'F') return (ch - 'A' + 10); // in hex A=10,B=11,...,F=15
if (ch >= 'a' && ch <= 'f') return (ch - 'a' + 10); // same for lower case
return 0;
}

// Returns true if `ch` is an ASCII hexadecimal digit, i.e. one of
// '0'-'9', 'A'-'F' or 'a'-'f'; returns false for every other character.
__device__ bool is_hex_digit(char ch)
jlowe marked this conversation as resolved.
Show resolved Hide resolved
{
return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
}

// Functor for detecting character escape sequences in URL-encoded strings.
// It returns true for a character index corresponding to the start of an
// escape sequence, i.e.: '%' followed by two hexadecimal digits.
struct url_decode_escape_detector {
size_type num_rows;
jlowe marked this conversation as resolved.
Show resolved Hide resolved
size_type const* d_offsets{};
char const* d_chars{};

__device__ bool operator()(size_type char_idx)
{
if (ch >= '0' && ch <= '9') return (ch - '0');
if (ch >= 'A' && ch <= 'F') return (ch - 'A' + 10); // in hex A=10,B=11,...,F=15
if (ch >= 'a' && ch <= 'f') return (ch - 'a' + 10); // same for lower case
return 0;
size_type const* next_row_idx_ptr =
thrust::upper_bound(thrust::seq, d_offsets, d_offsets + num_rows, char_idx);
size_type end_char_idx = *next_row_idx_ptr;
return (char_idx + 2 < end_char_idx) && d_chars[char_idx] == '%' &&
is_hex_digit(d_chars[char_idx + 1]) && is_hex_digit(d_chars[char_idx + 2]);
}
};

// main functor method executed on each string
__device__ size_type operator()(size_type idx)
// Functor for replacing character escape sequences in URL-encoded strings.
// Each escape sequence interprets the following 2 characters as hex values to create the output
// byte. For example, the sequence '%20' is converted into byte (0x20) which is a single space
// character. Another example converts '%C3%A9' into 2 sequential bytes (0xc3 and 0xa9
// respectively). Overall, 3 characters are converted into one byte whenever a '%' character
// is encountered in the string.
struct url_decode_char_replacer {
size_type chars_start_offset; // index of first character, can be non-zero for sliced column
size_type num_esc_pos;
jlowe marked this conversation as resolved.
Show resolved Hide resolved
size_type const* d_esc_positions{}; // character index of escape sequences
char const* d_in_chars{};
jlowe marked this conversation as resolved.
Show resolved Hide resolved
char* d_out_chars{};

__device__ void operator()(size_type input_idx)
{
if (d_strings.is_null(idx)) return 0;
string_view d_str = d_strings.element<string_view>(idx);
char* out_ptr = d_chars ? out_ptr = d_chars + d_offsets[idx] : nullptr;
size_type nbytes = 0;
const char* in_ptr = d_str.data();
const char* end = in_ptr + d_str.size_bytes();
while (in_ptr < end) // walk through each byte
{
char ch = *in_ptr++;
if ((ch == '%') && ((in_ptr + 1) < end)) { // found '%', convert hex to byte
ch = static_cast<char>(16 * hex_char_to_byte(*in_ptr++));
ch += static_cast<char>(hex_char_to_byte(*in_ptr++));
char ch = d_in_chars[input_idx];
jlowe marked this conversation as resolved.
Show resolved Hide resolved

// determine the number of escape sequences at or before this character position
size_type const* next_esc_pos_ptr =
thrust::upper_bound(thrust::seq, d_esc_positions, d_esc_positions + num_esc_pos, input_idx);
size_type num_prev_esc = next_esc_pos_ptr - d_esc_positions;

// every escape that occurs before this one replaces 3 characters with 1
size_type output_idx = input_idx - (num_prev_esc * 2) - chars_start_offset;
if (num_prev_esc > 0) {
size_type prev_esc_pos = *(next_esc_pos_ptr - 1);
// find the previous escape to see if this character is within the escape sequence
if (input_idx - prev_esc_pos < 3) {
if (input_idx == prev_esc_pos) { // at a position that needs to be replaced
ch = (hex_char_to_byte(d_in_chars[input_idx + 1]) << 4) |
hex_char_to_byte(d_in_chars[input_idx + 2]);
// previous escape sequence is this position, so the original calculation over-adjusted
output_idx += 2;
} else {
// one of the escape hex digits that has no corresponding character in the output
return;
}
}
++nbytes; // keeping track of bytes and chars
if (out_ptr) out_ptr = copy_and_increment(out_ptr, &ch, 1);
}
return nbytes;

d_out_chars[output_idx] = ch;
}
};

// Functor to update the string column offsets.
jlowe marked this conversation as resolved.
Show resolved Hide resolved
// Each offset is reduced by 2 for every escape sequence that occurs in the entire string column
// character data before the offset, as 3 characters are replaced with 1 for each escape.
// Maps an offset into the input character data to the matching offset in the
// decoded output by subtracting 2 for each recorded escape position that lies
// strictly before it.
struct url_decode_offsets_updater {
// number of entries in d_esc_positions
size_type num_esc_pos;
jlowe marked this conversation as resolved.
Show resolved Hide resolved
// recorded escape positions, compared directly against offset values
// NOTE(review): the binary search below assumes ascending order -- confirm
size_type const* d_esc_positions{};
jlowe marked this conversation as resolved.
Show resolved Hide resolved

// Returns `offset` reduced by twice the number of escape positions that are
// less than `offset`.
__device__ size_type operator()(size_type offset)
jlowe marked this conversation as resolved.
Show resolved Hide resolved
{
// determine the number of escape sequences occurring before this offset;
// lower_bound is used so an escape starting exactly at `offset` is not counted
size_type const* next_esc_pos_ptr =
thrust::lower_bound(thrust::seq, d_esc_positions, d_esc_positions + num_esc_pos, offset);
size_type num_prev_esc = next_esc_pos_ptr - d_esc_positions;
// every escape that occurs before this one replaces 3 characters with 1
return offset - (num_prev_esc * 2);
}
};

Expand All @@ -222,30 +276,65 @@ std::unique_ptr<column> url_decode(
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_strings_column(stream, mr);

auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
auto d_offsets = strings.offsets().data<size_type>();
// use indices relative to the base column data so they can be compared to offset values
auto d_in_chars = strings.chars().head<char>();
auto chars_bytes = strings.chars_size();
// determine index of first character in base column
size_type chars_start = 0;
if (strings.offset() != 0) {
chars_start = cudf::detail::get_value<size_type>(strings.offsets(), 0, stream);
}
size_type chars_end = chars_start + chars_bytes;
jlowe marked this conversation as resolved.
Show resolved Hide resolved

url_decode_escape_detector esc_detector{strings.size(), d_offsets, d_in_chars};

// count the number of URL escape sequences across all strings
size_type esc_count = thrust::count_if(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(chars_start),
thrust::make_counting_iterator<size_type>(chars_end),
esc_detector);

if (esc_count == 0) {
// nothing to replace, so just copy the input column
return std::make_unique<cudf::column>(strings.parent());
}

// create a vector of escape sequence positions
rmm::device_uvector<size_type> esc_positions(esc_count, stream);
auto d_esc_positions = esc_positions.data();
auto esc_pos_end = thrust::copy_if(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_t>(chars_start),
thrust::make_counting_iterator<size_t>(chars_end),
d_esc_positions,
esc_detector);

// copy null mask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
// build offsets column
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), url_decoder_fn{d_strings});
auto offsets_column = make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().data<int32_t>();
auto offsets_column = make_numeric_column(
data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
auto offsets_view = offsets_column->mutable_view();
thrust::transform(rmm::exec_policy(stream),
strings.offsets().begin<size_type>(),
strings.offsets().end<size_type>(),
offsets_view.begin<size_type>(),
jlowe marked this conversation as resolved.
Show resolved Hide resolved
jlowe marked this conversation as resolved.
Show resolved Hide resolved
url_decode_offsets_updater{esc_count, d_esc_positions});

// build chars column
// create the chars column
auto chars_column =
create_chars_child_column(strings_count,
strings.null_count(),
thrust::device_pointer_cast(d_offsets)[strings_count],
chars_bytes - (esc_count * 2), // replacing 3 bytes with 1
stream,
mr);
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
url_decoder_fn{d_strings, d_offsets, d_chars});
auto d_out_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
chars_bytes,
url_decode_char_replacer{chars_start, esc_count, d_esc_positions, d_in_chars, d_out_chars});

// copy null mask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);

return make_strings_column(strings_count,
std::move(offsets_column),
Expand Down