From 4945198763d6670ea3c442dab26bebd0cb80502c Mon Sep 17 00:00:00 2001 From: gaohao95 Date: Mon, 30 Aug 2021 09:30:23 -0700 Subject: [PATCH] Optimize URL Decoding (#8622) This PR is intended to optimize the URL decoding performance, especially on large URLs. Additionally, a test case for large URLs has been added. When tested on V100, baseline performance at 7521c3f2c1acc8161c82a5e4cd1d9f96f58945ec ``` ------------------------------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------------------------------------ UrlDecode<10>/url_decode_10pct/100000000/10/manual_time 111 ms 111 ms 6 bytes_per_second=11.7959G/s UrlDecode<10>/url_decode_10pct/10000000/100/manual_time 107 ms 107 ms 7 bytes_per_second=9.0136G/s UrlDecode<10>/url_decode_10pct/1000000/1000/manual_time 107 ms 107 ms 7 bytes_per_second=8.76755G/s UrlDecode<50>/url_decode_50pct/100000000/10/manual_time 129 ms 129 ms 5 bytes_per_second=10.144G/s UrlDecode<50>/url_decode_50pct/10000000/100/manual_time 126 ms 126 ms 6 bytes_per_second=7.70821G/s UrlDecode<50>/url_decode_50pct/1000000/1000/manual_time 122 ms 122 ms 6 bytes_per_second=7.66783G/s ``` This PR ``` ------------------------------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations UserCounters... ------------------------------------------------------------------------------------------------------------------ UrlDecode<10>/url_decode_10pct/100000000/10/manual_time 97.5 ms 97.6 ms 7 bytes_per_second=13.3669G/s UrlDecode<10>/url_decode_10pct/10000000/100/manual_time 28.8 ms 28.8 ms 24 bytes_per_second=33.6024G/s UrlDecode<10>/url_decode_10pct/1000000/1000/manual_time 21.8 ms 21.8 ms 32 bytes_per_second=42.9686G/s UrlDecode<50>/url_decode_50pct/100000000/10/manual_time 109 ms 109 ms 6 bytes_per_second=11.9786G/s UrlDecode<50>/url_decode_50pct/10000000/100/manual_time 30.2 ms 30.3 ms 23 bytes_per_second=32.0311G/s UrlDecode<50>/url_decode_50pct/1000000/1000/manual_time 22.7 ms 22.8 ms 31 bytes_per_second=41.1086G/s ``` close https://github.com/rapidsai/cudf/issues/8030 Authors: - https://github.com/gaohao95 Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/8622 --- .../string/url_decode_benchmark.cpp | 5 +- cpp/src/strings/convert/convert_urls.cu | 410 ++++++++++-------- cpp/tests/strings/urls_tests.cpp | 64 +++ 3 files changed, 287 insertions(+), 192 deletions(-) diff --git a/cpp/benchmarks/string/url_decode_benchmark.cpp b/cpp/benchmarks/string/url_decode_benchmark.cpp index fbb99bf3e8f..9cfaaf27286 100644 --- a/cpp/benchmarks/string/url_decode_benchmark.cpp +++ b/cpp/benchmarks/string/url_decode_benchmark.cpp @@ -92,8 +92,9 @@ void BM_url_decode(benchmark::State& state) BENCHMARK_TEMPLATE_DEFINE_F(UrlDecode, name, esc_seq_pct) \ (::benchmark::State & state) { BM_url_decode(state); } \ BENCHMARK_REGISTER_F(UrlDecode, name) \ - ->RangeMultiplier(10) \ - ->Ranges({{100, 100000}, {10, 10000}}) \ + ->Args({100000000, 10}) \ + ->Args({10000000, 100}) \ + ->Args({1000000, 1000}) \ ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index abf2dc25097..80d870abb8e 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -19,19 +19,25 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include #include #include -#include -#include +#include + #include +#include + +#include using cudf::device_span; @@ -185,137 +191,198 @@ constexpr bool is_hex_digit(char ch) return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); } -/** - * @brief Functor for detecting character escape sequences in URL-encoded strings. - */ -struct url_decode_escape_detector { - device_span const d_chars{}; - - /** - * @brief Detects if an escape sequence is at the specified character position. - * - * Returns true for a character index corresponding to the start of an escape - * sequence, i.e.: '%' followed by two hexadecimal digits. This does not check - * against string boundaries and therefore can produce false-positives. - * - * @param char_idx Character position to examine - * @return true An escape sequence was detected at the character position - * @return false No escape sequence at the character position - */ - __device__ bool operator()(size_type char_idx) const - { - return (char_idx + 2 < d_chars.size()) && d_chars[char_idx] == '%' && - is_hex_digit(d_chars[char_idx + 1]) && is_hex_digit(d_chars[char_idx + 2]); - } -}; +__forceinline__ __device__ bool is_escape_char(char const* const ptr) +{ + return (ptr[0] == '%' && is_hex_digit(ptr[1]) && is_hex_digit(ptr[2])); +} -/** - * @brief Functor for detecting escape sequence positions that cross a string boundary. - */ -struct url_decode_esc_position_filter { - device_span const d_offsets{}; - - /** - * @brief Detects if an escape sequence crosses a string boundary - * - * Returns true for an escape sequence that straddles a boundary between two strings. - * - * @param esc_pos_idx Character position corresponding to the start of an escape sequence - * @return true The escape sequence crosses a string boundary - * @return false The escape sequence does not cross a string boundary - */ - __device__ bool operator()(size_type esc_pos_idx) const - { - // find the end offset of the current string - size_type const* offset_ptr = - thrust::upper_bound(thrust::seq, d_offsets.begin(), d_offsets.end(), esc_pos_idx); - return esc_pos_idx + 2 >= *offset_ptr; - } -}; +// helper function for converting an escaped sequence starting at `ptr` to a single byte +__forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) +{ + return (hex_char_to_byte(ptr[1]) << 4) | hex_char_to_byte(ptr[2]); +} /** - * @brief Functor for replacing character escape sequences in URL-encoded strings. + * @brief Count the number of characters of each string after URL decoding. + * + * @tparam num_warps_per_threadblock Number of warps in a threadblock. This template argument must + * match the launch configuration, i.e. the kernel must be launched with + * `num_warps_per_threadblock * cudf::detail::warp_size` threads per threadblock. + * @tparam char_block_size Number of characters which will be loaded into the shared memory at a + * time. * - * Each escape sequence interprets the following 2 characters as hex values to create the output - * byte. For example, the sequence '%20' is converted into byte (0x20) which is a single space - * character. Another example converts '%C3%A9' into 2 sequential bytes (0xc3 and 0xa9 - * respectively). Overall, 3 characters are converted into one byte whenever a '%' character - * is encountered in the string. + * @param[in] in_strings Input string column. + * @param[out] out_counts Number of characters in each decode URL. */ -struct url_decode_char_replacer { - device_span const d_esc_positions{}; - char const* const d_in_chars{}; - char* const d_out_chars{}; - int32_t const first_input_char_offset = 0; - - /** - * @brief Copy an input character to the output, decoding escape sequences - * - * Each character position is examined against the list of known escape sequence positions. - * If the position is not within an escape sequence then the input character is copied to the - * output. If the position is the start of an escape sequence then the sequence is decoded to - * produce the character copied to the output. Any characters after the start of an escape - * sequence but still within the escape sequence are discarded. - * - * @param input_idx The input character position to process - */ - __device__ void operator()(size_type input_idx) const - { - char ch = d_in_chars[input_idx]; - - // determine the number of escape sequences at or before this character position - size_type const* next_esc_pos_ptr = - thrust::upper_bound(thrust::seq, d_esc_positions.begin(), d_esc_positions.end(), input_idx); - size_type num_prev_esc = next_esc_pos_ptr - d_esc_positions.data(); - - // every escape that occurs before this one replaces 3 characters with 1 - size_type output_idx = input_idx - (num_prev_esc * 2) - first_input_char_offset; - if (num_prev_esc > 0) { - size_type prev_esc_pos = *(next_esc_pos_ptr - 1); - // find the previous escape to see if this character is within the escape sequence - if (input_idx - prev_esc_pos < 3) { - if (input_idx == prev_esc_pos) { // at a position that needs to be replaced - ch = (hex_char_to_byte(d_in_chars[input_idx + 1]) << 4) | - hex_char_to_byte(d_in_chars[input_idx + 2]); - // previous escape sequence is this position, so the original calculation over-adjusted - output_idx += 2; - } else { - // one of the escape hex digits that has no corresponding character in the output - return; - } - } +template +__global__ void url_decode_char_counter(column_device_view const in_strings, + offset_type* const out_counts) +{ + constexpr int halo_size = 2; + __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; + __shared__ typename cub::WarpReduce::TempStorage cub_storage[num_warps_per_threadblock]; + + int const global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int const global_warp_id = global_thread_id / cudf::detail::warp_size; + int const local_warp_id = threadIdx.x / cudf::detail::warp_size; + int const warp_lane = threadIdx.x % cudf::detail::warp_size; + int const nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; + char* in_chars_shared = temporary_buffer[local_warp_id]; + + // Loop through strings, and assign each string to a warp. + for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + if (in_strings.is_null(row_idx)) { + out_counts[row_idx] = 0; + continue; } - d_out_chars[output_idx] = ch; + auto const in_string = in_strings.element(row_idx); + auto const in_chars = in_string.data(); + auto const string_length = in_string.size_bytes(); + int const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); + offset_type escape_char_count = 0; + + for (int block_idx = 0; block_idx < nblocks; block_idx++) { + int const string_length_block = + std::min(char_block_size, string_length - char_block_size * block_idx); + + // Each warp collectively loads input characters of the current block to the shared memory. + // When testing whether a location is the start of an escaped character, we need to access + // the current location as well as the next two locations. To avoid branches, two halo cells + // are added after the end of the block. If the cell is beyond the end of the string, 0s are + // filled in to make sure the last two characters of the string are not the start of an + // escaped sequence. + for (int char_idx = warp_lane; char_idx < string_length_block + halo_size; + char_idx += cudf::detail::warp_size) { + int const in_idx = block_idx * char_block_size + char_idx; + in_chars_shared[char_idx] = in_idx < string_length ? in_chars[in_idx] : 0; + } + + __syncwarp(); + + // `char_idx_start` represents the start character index of the current warp. + for (int char_idx_start = 0; char_idx_start < string_length_block; + char_idx_start += cudf::detail::warp_size) { + int const char_idx = char_idx_start + warp_lane; + int8_t const is_ichar_escape_char = + (char_idx < string_length_block && is_escape_char(in_chars_shared + char_idx)) ? 1 : 0; + + // Warp-wise reduction to calculate the number of escape characters. + // All threads in the warp participate in the reduction, even if `char_idx` is beyond + // `string_length_block`. + int8_t const total_escape_char = + cub::WarpReduce(cub_storage[local_warp_id]).Sum(is_ichar_escape_char); + + if (warp_lane == 0) { escape_char_count += total_escape_char; } + + __syncwarp(); + } + } + // URL decoding replaces 3 bytes with 1 for each escape character. + if (warp_lane == 0) { out_counts[row_idx] = string_length - escape_char_count * 2; } } -}; +} /** - * @brief Functor to update the string column offsets. + * @brief Decode and copy from the input string column to the output char buffer. + * + * @tparam num_warps_per_threadblock Number of warps in a threadblock. This template argument must + * match the launch configuration, i.e. the kernel must be launched with + * `num_warps_per_threadblock * cudf::detail::warp_size` threads per threadblock. + * @tparam char_block_size Number of characters which will be loaded into the shared memory at a + * time. + * + * @param[in] in_strings Input string column. + * @param[out] out_chars Character buffer for the output string column. + * @param[in] out_offsets Offset value of each string associated with `out_chars`. */ -struct url_decode_offsets_updater { - device_span const d_esc_positions{}; - int32_t const first_input_offset = 0; - - /** - * @brief Convert input offsets into output offsets - * - * Each offset is reduced by 2 for every escape sequence that occurs in the entire string column - * character data before the offset, as 3 characters are replaced with 1 for each escape. - * - * @param offset An original offset value from the input string column - * @return Adjusted offset value - */ - __device__ int32_t operator()(int32_t offset) const - { - // determine the number of escape sequences occurring before this offset - size_type const* next_esc_pos_ptr = - thrust::lower_bound(thrust::seq, d_esc_positions.begin(), d_esc_positions.end(), offset); - size_type num_prev_esc = next_esc_pos_ptr - d_esc_positions.data(); - // every escape that occurs before this one replaces 3 characters with 1 - return offset - first_input_offset - (num_prev_esc * 2); +template +__global__ void url_decode_char_replacer(column_device_view const in_strings, + char* const out_chars, + offset_type const* const out_offsets) +{ + constexpr int halo_size = 2; + __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; + __shared__ typename cub::WarpScan::TempStorage cub_storage[num_warps_per_threadblock]; + __shared__ int out_idx[num_warps_per_threadblock]; + + int const global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int const global_warp_id = global_thread_id / cudf::detail::warp_size; + int const local_warp_id = threadIdx.x / cudf::detail::warp_size; + int const warp_lane = threadIdx.x % cudf::detail::warp_size; + int const nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; + char* in_chars_shared = temporary_buffer[local_warp_id]; + + // Loop through strings, and assign each string to a warp + for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + if (in_strings.is_null(row_idx)) continue; + + auto const in_string = in_strings.element(row_idx); + auto const in_chars = in_string.data(); + auto const string_length = in_string.size_bytes(); + auto out_chars_string = out_chars + out_offsets[row_idx]; + int const nblocks = cudf::util::div_rounding_up_unsafe(string_length, char_block_size); + + // Use the last thread of the warp to initialize `out_idx` to 0. + if (warp_lane == cudf::detail::warp_size - 1) { out_idx[local_warp_id] = 0; } + + for (int block_idx = 0; block_idx < nblocks; block_idx++) { + int const string_length_block = + std::min(char_block_size, string_length - char_block_size * block_idx); + + // Each warp collectively loads input characters of the current block to shared memory. + // Two halo cells before and after the block are added. The halo cells are used to test + // whether the current location as well as the previous two locations are escape characters, + // without branches. + for (int char_idx = warp_lane; char_idx < string_length_block + halo_size * 2; + char_idx += cudf::detail::warp_size) { + int const in_idx = block_idx * char_block_size + char_idx - halo_size; + in_chars_shared[char_idx] = in_idx >= 0 && in_idx < string_length ? in_chars[in_idx] : 0; + } + + __syncwarp(); + + // `char_idx_start` represents the start character index of the current warp. + for (int char_idx_start = 0; char_idx_start < string_length_block; + char_idx_start += cudf::detail::warp_size) { + int const char_idx = char_idx_start + warp_lane; + // If the current character is part of an escape sequence starting at the previous two + // locations, the thread with the starting location should output the escaped character, and + // the current thread should not output a character. + int8_t const out_size = + (char_idx >= string_length_block || is_escape_char(in_chars_shared + char_idx) || + is_escape_char(in_chars_shared + char_idx + 1)) + ? 0 + : 1; + + // Warp-wise prefix sum to establish output location of the current thread. + // All threads in the warp participate in the prefix sum, even if `char_idx` is beyond + // `string_length_block`. + int8_t out_offset; + cub::WarpScan(cub_storage[local_warp_id]).ExclusiveSum(out_size, out_offset); + + if (out_size == 1) { + char const* const ch_ptr = in_chars_shared + char_idx + halo_size; + char const ch = + is_escape_char(ch_ptr) + ? + // If the current location is the start of an escape sequence, load and decode. + escaped_sequence_to_byte(ch_ptr) + : + // If the current location is not the start of an escape sequence, load directly. + *ch_ptr; + out_chars_string[out_idx[local_warp_id] + out_offset] = ch; + } + + if (warp_lane == cudf::detail::warp_size - 1) { + out_idx[local_warp_id] += (out_offset + out_size); + } + + __syncwarp(); + } + } } -}; +} } // namespace @@ -328,82 +395,45 @@ std::unique_ptr url_decode( size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); - auto offset_count = strings_count + 1; - auto d_offsets = strings.offsets().data() + strings.offset(); - auto d_in_chars = strings.chars().data(); - // determine index of first character in base column - size_type chars_start = (strings.offset() == 0) ? 0 - : cudf::detail::get_value( - strings.offsets(), strings.offset(), stream); - size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( - strings.offsets(), strings.offset() + strings_count, stream); - size_type chars_bytes = chars_end - chars_start; - - url_decode_escape_detector esc_detector{device_span(d_in_chars, chars_end)}; - - // Count the number of URL escape sequences across all strings, ignoring string boundaries. - // This may count more sequences than actually are there since string boundaries are ignored. - size_type esc_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - esc_detector); - - if (esc_count == 0) { - // nothing to replace, so just copy the input column - return std::make_unique(strings.parent(), stream, mr); - } + constexpr int num_warps_per_threadblock = 4; + constexpr int threadblock_size = num_warps_per_threadblock * cudf::detail::warp_size; + constexpr int char_block_size = 256; + const int num_threadblocks = + std::min(65536, cudf::util::div_rounding_up_unsafe(strings_count, num_warps_per_threadblock)); - // create a vector of the potential escape sequence positions - rmm::device_uvector esc_positions(esc_count, stream); - auto d_esc_positions = esc_positions.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - d_esc_positions, - esc_detector); - - // In-place remove any escape positions that crossed string boundaries. - device_span d_offsets_span(d_offsets, offset_count); - auto esc_pos_end = thrust::remove_if(rmm::exec_policy(stream), - d_esc_positions, - d_esc_positions + esc_count, - url_decode_esc_position_filter{d_offsets_span}); - - // update count in case any were filtered - esc_count = esc_pos_end - d_esc_positions; - if (esc_count == 0) { - // nothing to replace, so just copy the input column - return std::make_unique(strings.parent(), stream, mr); - } - - device_span d_esc_positions_span(d_esc_positions, esc_count); + auto offset_count = strings_count + 1; + auto const d_strings = column_device_view::create(strings.parent(), stream); // build offsets column auto offsets_column = make_numeric_column( data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - thrust::transform(rmm::exec_policy(stream), - d_offsets_span.begin(), - d_offsets_span.end(), - offsets_view.begin(), - url_decode_offsets_updater{d_esc_positions_span, chars_start}); + + // count number of bytes in each string after decoding and store it in offsets_column + auto offsets_view = offsets_column->view(); + auto offsets_mutable_view = offsets_column->mutable_view(); + url_decode_char_counter + <<>>( + *d_strings, offsets_mutable_view.begin()); + + // use scan to transform number of bytes into offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets_view.begin(), + offsets_view.end(), + offsets_mutable_view.begin()); + + // copy the total number of characters of all strings combined (last element of the offset column) + // to the host memory + auto out_chars_bytes = + cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column - auto chars_column = - create_chars_child_column(chars_bytes - (esc_count * 2), // replacing 3 bytes with 1 - stream, - mr); - auto d_out_chars = chars_column->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - chars_bytes, - url_decode_char_replacer{d_esc_positions_span, d_in_chars, d_out_chars, chars_start}); - - // free the escape positions buffer as it is no longer needed - (void)esc_positions.release(); + auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); + auto d_out_chars = chars_column->mutable_view().data(); + + // decode and copy the characters from the input column to the output column + url_decode_char_replacer + <<>>( + *d_strings, d_out_chars, offsets_column->view().begin()); // copy null mask rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); diff --git a/cpp/tests/strings/urls_tests.cpp b/cpp/tests/strings/urls_tests.cpp index e6fbc50e40e..6ac3e08b1e9 100644 --- a/cpp/tests/strings/urls_tests.cpp +++ b/cpp/tests/strings/urls_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include struct StringsConvertTest : public cudf::test::BaseFixture { @@ -159,6 +160,69 @@ TEST_F(StringsConvertTest, UrlDecodeSliced) } } +TEST_F(StringsConvertTest, UrlDecodeLargeStrings) +{ + constexpr int string_len = 35000; + std::vector string_encoded; + string_encoded.reserve(string_len * 3); + std::vector string_plain; + string_plain.reserve(string_len + 1); + + std::random_device rd; + std::mt19937 random_number_generator(rd()); + std::uniform_int_distribution distribution(0, 4); + + for (int character_idx = 0; character_idx < string_len; character_idx++) { + switch (distribution(random_number_generator)) { + case 0: + string_encoded.push_back('a'); + string_plain.push_back('a'); + break; + case 1: + string_encoded.push_back('b'); + string_plain.push_back('b'); + break; + case 2: + string_encoded.push_back('c'); + string_plain.push_back('c'); + break; + case 3: + string_encoded.push_back('%'); + string_encoded.push_back('3'); + string_encoded.push_back('F'); + string_plain.push_back('?'); + break; + case 4: + string_encoded.push_back('%'); + string_encoded.push_back('3'); + string_encoded.push_back('D'); + string_plain.push_back('='); + break; + } + } + string_encoded.push_back('\0'); + string_plain.push_back('\0'); + + std::vector h_strings{string_encoded.data()}; + cudf::test::strings_column_wrapper strings( + h_strings.cbegin(), + h_strings.cend(), + thrust::make_transform_iterator(h_strings.cbegin(), + [](auto const str) { return str != nullptr; })); + + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::url_decode(strings_view); + + std::vector h_expected{string_plain.data()}; + cudf::test::strings_column_wrapper expected( + h_expected.cbegin(), + h_expected.cend(), + thrust::make_transform_iterator(h_expected.cbegin(), + [](auto const str) { return str != nullptr; })); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsConvertTest, ZeroSizeUrlStringsColumn) { cudf::column_view zero_size_column(