rapidsai · rapids-bot · Feb 15, 2021 · Feb 8, 2021 · Feb 9, 2021 · Feb 9, 2021
@@ -305,6 +305,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}")
 # - strings benchmark -------------------------------------------------------------------
 
 set(STRINGS_BENCH_SRC
-  "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp")
+  "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/string/url_decode_benchmark.cpp")  # list is passed to ConfigureBench(STRINGS_BENCH ...) below
 
 ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}")
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+
+#include <cudf/strings/convert/convert_urls.hpp>
+#include <cudf/types.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+
+#include <algorithm>
+#include <random>
+
+#include "../fixture/benchmark_fixture.hpp"
+#include "../synchronization/synchronization.hpp"
+#include "cudf/column/column_view.hpp"
+#include "cudf/wrappers/durations.hpp"
+
+// Host-side generator of fixed-length pseudo-URL strings used as benchmark input.
+// Every generated string is exactly `num_chars` bytes long and is made of the filler
+// character 'a' interleaved with "%20" escape sequences. At each position an escape
+// sequence is emitted with probability `esc_seq_chance`, but only where it leaves room
+// for at least one further character.
+struct url_string_generator {
+  size_t num_chars;                  // length in bytes of every generated string
+  std::bernoulli_distribution dist;  // true => emit an escape sequence at the current position
+
+  url_string_generator(size_t num_chars, double esc_seq_chance)
+    : num_chars{num_chars}, dist{esc_seq_chance}
+  {
+  }
+
+  // Builds one string, drawing all randomness from `engine`.
+  std::string operator()(std::mt19937& engine)
+  {
+    constexpr size_t esc_seq_len = 3;  // number of characters in "%20"
+    std::string str;
+    str.reserve(num_chars);
+    while (str.size() < num_chars) {
+      // Written as an addition instead of `num_chars - 3`: that subtraction is unsigned and
+      // wraps around for num_chars < 3, which allowed the escape branch to overshoot num_chars.
+      // The engine is still only consulted when an escape sequence would fit.
+      if (str.size() + esc_seq_len < num_chars && dist(engine)) {
+        str += "%20";
+      } else {
+        str.push_back('a');
+      }
+    }
+    return str;
+  }
+};
+
+// Builds a strings column wrapper holding `num_rows` strings, each produced by
+// url_string_generator with `chars_per_row` bytes and escape probability `esc_seq_chance`.
+// The random engine is seeded with the constant 1, so calls with the same arguments
+// consume the same random sequence.
+cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows,
+                                                   cudf::size_type chars_per_row,
+                                                   double esc_seq_chance)
+{
+  std::mt19937 rng(1);
+  url_string_generator make_url(chars_per_row, esc_seq_chance);
+
+  std::vector<std::string> rows;
+  rows.reserve(num_rows);
+  for (cudf::size_type row = 0; row < num_rows; ++row) {
+    rows.push_back(make_url(rng));
+  }
+
+  return cudf::test::strings_column_wrapper(rows.begin(), rows.end());
+}
+
+// Benchmark fixture for the url_decode benchmarks; the template argument is the
+// escape-sequence percentage that the matching BM_url_decode instantiation uses.
+template <int esc_seq_pct>
+struct UrlDecode : cudf::benchmark {
+};
+
+// Benchmark body: repeatedly runs cudf::strings::url_decode on a generated strings column.
+// state.range(0) supplies the number of rows and state.range(1) the characters per row;
+// `esc_seq_pct` / 100.0 is the escape-sequence probability handed to generate_column.
+template <int esc_seq_pct>
+void BM_url_decode(benchmark::State& state)
+{
+  cudf::size_type const row_count  = state.range(0);
+  cudf::size_type const row_length = state.range(1);
+
+  double const esc_seq_chance = esc_seq_pct / 100.0;
+  auto input                  = generate_column(row_count, row_length, esc_seq_chance);
+  auto input_view             = cudf::strings_column_view(input);
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true, 0);
+    auto decoded = cudf::strings::url_decode(input_view);
+  }
+
+  // each row is counted as its characters plus one size_type-sized value
+  auto const bytes_per_row = row_length + sizeof(cudf::size_type);
+  state.SetBytesProcessed(state.iterations() * row_count * bytes_per_row);
+}
+
+#define URLD_BENCHMARK_DEFINE(name, esc_seq_pct)                      \
+  BENCHMARK_TEMPLATE_DEFINE_F(UrlDecode, name, esc_seq_pct)           \
+  (::benchmark::State & state) { BM_url_decode<esc_seq_pct>(state); } \
+  BENCHMARK_REGISTER_F(UrlDecode, name)                               \
+    ->RangeMultiplier(10)                                             \
+    ->Ranges({{100, 100000}, {10, 10000}}) /* {rows}, {chars/row} */  \
+    ->Unit(benchmark::kMillisecond)                                   \
+    ->UseManualTime();  // manual timing: presumably reported by cuda_event_timer — TODO confirm
+
+URLD_BENCHMARK_DEFINE(url_decode_10pct, 10)  // runs BM_url_decode<10>: escape chance 10 / 100.0
+URLD_BENCHMARK_DEFINE(url_decode_20pct, 20)  // runs BM_url_decode<20>: escape chance 20 / 100.0
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_urls.hpp>
@@ -25,6 +26,11 @@
 #include <strings/utilities.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/count.h>
+#include <thrust/iterator/counting_iterator.h>
 
 namespace cudf {
 namespace strings {
@@ -164,50 +170,98 @@ std::unique_ptr<column> url_encode(strings_column_view const& strings,
 
 namespace detail {
 namespace {
-//
-// This is the functor for the url_decode() method below.
-// Specific requirements are documented in custrings issue #321.
-// In summary it converts all character sequences starting with '%' into bytes
-// interpretting the following 2 characters as hex values to create the output byte.
-// For example, the sequence '%20' is converted into byte (0x20) which is a single
-// space character. Another example converts '%C3%A9' into 2 sequential bytes
-// (0xc3 and 0xa9 respectively). Overall, 3 characters are converted into one byte
-// whenever a '%' character is encountered in the string.
-//
-struct url_decoder_fn {
-  column_device_view const d_strings;
-  int32_t const* d_offsets{};
-  char* d_chars{};
 
-  // utility to convert a hex char into a single byte
-  __device__ uint8_t hex_char_to_byte(char ch)
+// Converts a single hexadecimal digit character to its numeric value:
+// '0'-'9' map to 0-9, and 'A'-'F' / 'a'-'f' map to 10-15.
+// Any other character yields 0.
+__device__ uint8_t hex_char_to_byte(char ch)
+{
+  bool const is_decimal = ('0' <= ch) && (ch <= '9');
+  bool const is_upper   = ('A' <= ch) && (ch <= 'F');
+  bool const is_lower   = ('a' <= ch) && (ch <= 'f');
+
+  if (is_decimal) { return ch - '0'; }
+  if (is_upper) { return ch - 'A' + 10; }
+  if (is_lower) { return ch - 'a' + 10; }
+  return 0;
+}
+
+// Returns true when `ch` is one of '0'-'9', 'A'-'F' or 'a'-'f'.
+__device__ bool is_hex_digit(char ch)
+{
+  if (ch >= '0' && ch <= '9') { return true; }
+  if (ch >= 'A' && ch <= 'F') { return true; }
+  return ch >= 'a' && ch <= 'f';
+}
+
+// Functor for detecting character escape sequences in URL-encoded strings. Given a character
+// index into d_chars, it returns true when that index is the start of an escape sequence, i.e.:
+// '%' followed by two hexadecimal digits, with all three characters located before the next offset.
+struct url_decode_escape_detector {
+  size_type num_rows;  // number of leading d_offsets entries that are binary searched
+  size_type const* d_offsets{};  // offsets compared directly against char_idx
+  char const* d_chars{};  // character data indexed by char_idx
+
+  __device__ bool operator()(size_type char_idx)
   {
-    if (ch >= '0' && ch <= '9') return (ch - '0');
-    if (ch >= 'A' && ch <= 'F') return (ch - 'A' + 10);  // in hex A=10,B=11,...,F=15
-    if (ch >= 'a' && ch <= 'f') return (ch - 'a' + 10);  // same for lower case
-    return 0;
+    size_type const* next_row_idx_ptr =  // first searched offset > char_idx, else d_offsets + num_rows
+      thrust::upper_bound(thrust::seq, d_offsets, d_offsets + num_rows, char_idx);
+    size_type end_char_idx = *next_row_idx_ptr;  // may read d_offsets[num_rows]: assumes num_rows + 1 offsets — TODO confirm
+    return (char_idx + 2 < end_char_idx) && d_chars[char_idx] == '%' &&  // '%' and both digits precede end_char_idx
+           is_hex_digit(d_chars[char_idx + 1]) && is_hex_digit(d_chars[char_idx + 2]);
   }
+};
 
-  // main functor method executed on each string
-  __device__ size_type operator()(size_type idx)
+// Functor for replacing character escape sequences in URL-encoded strings.
+// An escape sequence is a '%' followed by two hexadecimal digits; the two digits are
+// interpreted as hex values to create the output byte. For example, the sequence '%20' is
+// converted into byte (0x20) which is a single space character. Another example converts
+// '%C3%A9' into 2 sequential bytes (0xc3 and 0xa9 respectively). Overall, 3 characters are
+// converted into one byte for every position listed in d_esc_positions.
+//
+// operator() handles one input character. Its argument is an index relative to the first
+// character of the (possibly sliced) column: url_decode invokes it with a counting sequence
+// that starts at 0. The index is rebased by chars_start_offset before it is used to read
+// d_in_chars or compared against d_esc_positions, which both use indices relative to the
+// base character data. Output positions start at 0.
+struct url_decode_char_replacer {
+  size_type chars_start_offset;  // index of first character, can be non-zero for sliced column
+  size_type num_esc_pos;               // number of entries in d_esc_positions
+  size_type const* d_esc_positions{};  // character index of escape sequences
+  char const* d_in_chars{};            // input characters, indexed relative to the base data
+  char* d_out_chars{};                 // decoded characters, index 0 is the first output byte
+
+  __device__ void operator()(size_type input_idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    string_view d_str = d_strings.element<string_view>(idx);
-    char* out_ptr = d_chars ? out_ptr = d_chars + d_offsets[idx] : nullptr;
-    size_type nbytes                  = 0;
-    const char* in_ptr                = d_str.data();
-    const char* end                   = in_ptr + d_str.size_bytes();
-    while (in_ptr < end)  // walk through each byte
-    {
-      char ch = *in_ptr++;
-      if ((ch == '%') && ((in_ptr + 1) < end)) {  // found '%', convert hex to byte
-        ch = static_cast<char>(16 * hex_char_to_byte(*in_ptr++));
-        ch += static_cast<char>(hex_char_to_byte(*in_ptr++));
+    // Position of this character in the base character data. Without this rebasing a sliced
+    // column would read the wrong characters and compute a negative output index.
+    size_type const abs_idx = input_idx + chars_start_offset;
+    char ch                 = d_in_chars[abs_idx];
+
+    // determine the number of escape sequences at or before this character position
+    size_type const* next_esc_pos_ptr =
+      thrust::upper_bound(thrust::seq, d_esc_positions, d_esc_positions + num_esc_pos, abs_idx);
+    size_type num_prev_esc = next_esc_pos_ptr - d_esc_positions;
+
+    // every counted escape replaces 3 characters with 1; input_idx is already relative to the
+    // first character, so no further chars_start_offset adjustment is needed
+    size_type output_idx = input_idx - (num_prev_esc * 2);
+    if (num_prev_esc > 0) {
+      size_type prev_esc_pos = *(next_esc_pos_ptr - 1);
+      // find the previous escape to see if this character is within the escape sequence
+      if (abs_idx - prev_esc_pos < 3) {
+        if (abs_idx == prev_esc_pos) {  // at a position that needs to be replaced
+          ch = (hex_char_to_byte(d_in_chars[abs_idx + 1]) << 4) |
+               hex_char_to_byte(d_in_chars[abs_idx + 2]);
+          // previous escape sequence is this position, so the original calculation over-adjusted
+          output_idx += 2;
+        } else {
+          // one of the escape hex digits that has no corresponding character in the output
+          return;
+        }
       }
-      ++nbytes;  // keeping track of bytes and chars
-      if (out_ptr) out_ptr = copy_and_increment(out_ptr, &ch, 1);
     }
-    return nbytes;
+
+    d_out_chars[output_idx] = ch;
+  }
+};
+
+// Functor that maps an input offset to the corresponding offset after decoding.
+// Every escape sequence positioned before the offset shrinks the character data by 2 bytes
+// (3 input characters become 1 output byte), so the offset is lowered by 2 per such escape.
+// NOTE(review): the result stays in the same index space as the incoming offset; no
+// first-offset rebasing happens here — confirm this is what callers need for sliced columns.
+struct url_decode_offsets_updater {
+  size_type num_esc_pos;               // number of entries in d_esc_positions
+  size_type const* d_esc_positions{};  // escape positions; binary searched, so assumed ascending
+
+  __device__ size_type operator()(size_type offset)
+  {
+    auto const esc_begin = d_esc_positions;
+    auto const esc_end   = esc_begin + num_esc_pos;
+    // first escape position that is not before `offset`; everything ahead of it is counted
+    auto const first_not_before = thrust::lower_bound(thrust::seq, esc_begin, esc_end, offset);
+    auto const escapes_before   = static_cast<size_type>(first_not_before - esc_begin);
+    return offset - 2 * escapes_before;
   }
 };
 
@@ -222,30 +276,65 @@ std::unique_ptr<column> url_decode(
   size_type strings_count = strings.size();
   if (strings_count == 0) return make_empty_strings_column(stream, mr);
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
+  auto d_offsets = strings.offsets().data<size_type>();
+  // use indices relative to the base column data so they can be compared to offset values
+  auto d_in_chars  = strings.chars().head<char>();
+  auto chars_bytes = strings.chars_size();
+  // determine index of first character in base column
+  size_type chars_start = 0;
+  if (strings.offset() != 0) {
+    chars_start = cudf::detail::get_value<size_type>(strings.offsets(), 0, stream);
+  }
+  size_type chars_end = chars_start + chars_bytes;
+
+  url_decode_escape_detector esc_detector{strings.size(), d_offsets, d_in_chars};
+
+  // count the number of URL escape sequences across all strings
+  size_type esc_count = thrust::count_if(rmm::exec_policy(stream),
+                                         thrust::make_counting_iterator<size_type>(chars_start),
+                                         thrust::make_counting_iterator<size_type>(chars_end),
+                                         esc_detector);
+
+  if (esc_count == 0) {
+    // nothing to replace, so just copy the input column
+    return std::make_unique<cudf::column>(strings.parent());
+  }
+
+  // create a vector of escape sequence positions
+  rmm::device_uvector<size_type> esc_positions(esc_count, stream);
+  auto d_esc_positions = esc_positions.data();
+  auto esc_pos_end     = thrust::copy_if(rmm::exec_policy(stream),
+                                     thrust::make_counting_iterator<size_t>(chars_start),
+                                     thrust::make_counting_iterator<size_t>(chars_end),
+                                     d_esc_positions,
+                                     esc_detector);
 
-  // copy null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
   // build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_type>(0), url_decoder_fn{d_strings});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
+  auto offsets_column = make_numeric_column(
+    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+  auto offsets_view = offsets_column->mutable_view();
+  thrust::transform(rmm::exec_policy(stream),
+                    strings.offsets().begin<size_type>(),
+                    strings.offsets().end<size_type>(),
+                    offsets_view.begin<size_type>(),
+                    url_decode_offsets_updater{esc_count, d_esc_positions});
 
-  // build chars column
+  // create the chars column
   auto chars_column =
     create_chars_child_column(strings_count,
                               strings.null_count(),
-                              thrust::device_pointer_cast(d_offsets)[strings_count],
+                              chars_bytes - (esc_count * 2),  // replacing 3 bytes with 1
                               stream,
                               mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     strings_count,
-                     url_decoder_fn{d_strings, d_offsets, d_chars});
+  auto d_out_chars = chars_column->mutable_view().data<char>();
+  thrust::for_each_n(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    chars_bytes,
+    url_decode_char_replacer{chars_start, esc_count, d_esc_positions, d_in_chars, d_out_chars});
+
+  // copy null mask
+  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),