Skip to content

Commit

Permalink
Add strings 'like' function (#11558)
Browse files Browse the repository at this point in the history
Adds a new strings `like` function to cudf. This is a wildcard-based string matching function modeled on SQL's LIKE operator.
https://www.sqltutorial.org/sql-like/
Though some SQL implementations provide regex-like capabilities in the `like` statement pattern, the implementation here is strictly limited to the `%` (multi-character placeholder) and the `_` (single character placeholder) behavior. It also accepts an optional escape character that can be used when trying to match strings that contain `%` or `_` in them.

This is an easier (and faster) alternative to using the regex based `contains` function.
Example usage:
```
s = cudf.Series(["David", "Daniel", "Darcy"])
s.str.like('Da%')   ==> [True, True, True]    # starts with 'Da'
s.str.like('_a_i%') ==> [True, True, False]   # 2nd character is 'a' and 4th character is 'i'
s.str.like('_____') ==> [True, False, True]   # match any 5 characters
s.str.like('%y')    ==> [False, False, True]  # ends with 'y'
```

This PR includes gtests, pytests, and an nvbench benchmark.

Reference #10797

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Tobias Ribizel (https://github.com/upsj)
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #11558
  • Loading branch information
davidwendt authored Aug 26, 2022
1 parent 5f15ed4 commit ccd72f2
Show file tree
Hide file tree
Showing 13 changed files with 635 additions and 2 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,7 @@ add_library(
src/strings/extract/extract_all.cu
src/strings/filling/fill.cu
src/strings/filter_chars.cu
src/strings/like.cu
src/strings/padding.cu
src/strings/json/json_path.cu
src/strings/regex/regcomp.cpp
Expand Down
3 changes: 3 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ ConfigureBench(
string/factory.cu
string/filter.cpp
string/find.cpp
string/like.cpp
string/repeat_strings.cpp
string/replace.cpp
string/replace_re.cpp
Expand All @@ -290,6 +291,8 @@ ConfigureBench(
string/url_decode.cu
)

ConfigureNVBench(STRINGS_NVBENCH string/like.cpp)

# ##################################################################################################
# * json benchmark -------------------------------------------------------------------
ConfigureBench(JSON_BENCH string/json.cu)
Expand Down
98 changes: 98 additions & 0 deletions cpp/benchmarks/string/like.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/copying.hpp>
#include <cudf/filling.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

namespace {
/**
 * @brief Builds the strings column used as input for the `like` benchmark.
 *
 * Rows are drawn from a fixed set of 10 strings. A random gather-map picks
 * among indices [1, size-1]; index 0 is then written into that map at a
 * fixed stride so that about `hit_rate` percent of the rows hold string 0.
 *
 * @param n_rows Number of rows in the returned column
 * @param hit_rate Percentage of rows that should hold the string at index 0
 * @return Strings column with `n_rows` rows
 */
std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows, int32_t hit_rate)
{
  // build input table using the following data
  auto data = cudf::test::strings_column_wrapper({
    "123 abc 4567890 DEFGHI 0987 5W43",  // matches always;
    "012345 6789 01234 56789 0123 456",  // the rest do not match
    "abc 4567890 DEFGHI 0987 Wxyz 123",
    "abcdefghijklmnopqrstuvwxyz 01234",
    "",
    "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
    "9876543210,abcdefghijklmnopqrstU",
    "9876543210,abcdefghijklmnopqrstU",
    "123 édf 4567890 DéFG 0987 X5",
    "1",
  });
  auto data_view = cudf::column_view(data);

  // compute number of rows in n_rows that should match;
  // the product is formed in 64 bits so a large n_rows cannot overflow it
  auto const matches = static_cast<int32_t>(static_cast<int64_t>(n_rows) * hit_rate / 100);

  // Create a randomized gather-map to build a column out of the strings in data.
  // The distribution starts at 1 so index 0 is only ever placed by the scatter below.
  data_profile gather_profile =
    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
  auto gather_table =
    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);

  // Place 0-index values throughout the gather-map at a fixed stride.
  // Skipped when no row should match: the stride n_rows / matches would
  // otherwise divide by zero.
  if (matches > 0) {
    auto scatter_data = cudf::sequence(
      matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
    auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
    gather_table     = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
  }

  auto gather_map = gather_table->view().column(0);
  auto table      = cudf::gather(cudf::table_view({data_view}), gather_map);

  // release() yields the table's columns; move the only one out to the caller
  return std::move(table->release().front());
}

} // namespace

/**
 * @brief nvbench body that times `cudf::strings::like` on a generated column.
 *
 * Reads the `num_rows` and `hit_rate` axis values from `state`, builds the
 * input with `build_input_column`, and records byte-level throughput counters.
 */
static void bench_like(nvbench::state& state)
{
  cudf::rmm_pool_raii pool_raii;

  auto const hit_rate = static_cast<int32_t>(state.get_int64("hit_rate"));
  auto const n_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));

  auto const column   = build_input_column(n_rows, hit_rate);
  auto const str_view = cudf::strings_column_view(column->view());

  // The leading '%' means a matching row is only confirmed at the very end of
  // the target string, so the whole string is read for rows expected to match.
  // Comparable to the regex ".* 5W4." applied to the entire string.
  auto const pattern = std::string("% 5W4_");

  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));

  // throughput statistics: every input byte is read, one BOOL8 is written per row
  auto const total_chars = str_view.chars_size();
  state.add_element_count(total_chars, "chars_size");
  state.add_global_memory_reads<nvbench::int8_t>(total_chars);
  state.add_global_memory_writes<nvbench::int8_t>(n_rows);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto result = cudf::strings::like(str_view, pattern);
  });
}

// Register bench_like with nvbench under the name "strings_like".
// The two int64 axes are read back in bench_like: "num_rows" as the row count
// and "hit_rate" as the percentage of rows expected to match.
NVBENCH_BENCH(bench_like)
.set_name("strings_like")
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("hit_rate", {1, 5, 10, 25, 70, 100});
46 changes: 46 additions & 0 deletions cpp/include/cudf/strings/contains.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/strings_column_view.hpp>

Expand Down Expand Up @@ -111,6 +112,51 @@ std::unique_ptr<column> count_re(
regex_flags const flags = regex_flags::DEFAULT,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying rows which
* match the given like pattern.
*
* The pattern is compared against the entire string of each row;
* add a leading and/or trailing `%` to match only part of a string.
*
* The like pattern expects only 2 wildcard special characters:
* - `%` any number of any character (including no characters)
* - `_` any single character
*
* @code{.pseudo}
* Example:
* s = ["azaa", "ababaabba", "aaxa"]
* r = like(s, "%a_aa%")
* r is now [1, 1, 0]
* r = like(s, "a__a")
* r is now [1, 0, 1]
* @endcode
*
* Specify an escape character to include either `%` or `_` in the search.
* The pattern character that follows the escape character is matched literally.
* The `escape_character` is expected to be either 0 or 1 characters.
* If more than one character is specified only the first character is used.
*
* @code{.pseudo}
* Example:
* s = ["abc_def", "abc1def", "abc_"]
* r = like(s, "abc/_d%", "/")
* r is now [1, 0, 0]
* @endcode
*
* Any null string entries return corresponding null output column entries.
*
* @throw cudf::logic_error if `pattern` or `escape_character` is invalid
*
* @param input Strings instance for this operation
* @param pattern Like pattern to match within each string
* @param escape_character Optional character specifies the escape prefix;
* default is no escape character
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New boolean column
*/
std::unique_ptr<column> like(
strings_column_view const& input,
string_scalar const& pattern,
string_scalar const& escape_character = string_scalar(""),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
152 changes: 152 additions & 0 deletions cpp/src/strings/like.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

namespace {

// Pattern placeholder for any run of characters, including an empty run
constexpr char multi_wildcard{'%'};
// Pattern placeholder for exactly one character
constexpr char single_wildcard{'_'};

/**
 * @brief Functor that tests one row of a strings column against a like pattern.
 *
 * The pattern must cover the whole row string. `%` matches any run of
 * characters (possibly empty) and `_` matches exactly one character, unless
 * preceded by the escape character, in which case they are compared literally.
 * A null row yields `false`.
 */
struct like_fn {
  column_device_view const d_strings;  // rows to test
  string_view const d_pattern;         // like pattern
  string_view const d_escape;          // escape prefix; only its first character is used

  __device__ bool operator()(size_type const idx)
  {
    if (d_strings.is_null(idx)) { return false; }
    auto const d_str = d_strings.element<string_view>(idx);

    // character iterators are used throughout so multi-byte UTF-8 characters
    // are stepped over as single units
    auto const str_end  = d_str.end();
    auto const pat_end  = d_pattern.end();
    auto const esc_char = d_escape.empty() ? 0 : d_escape[0];

    auto str_pos = d_str.begin();
    auto pat_pos = d_pattern.begin();

    // positions to resume from after a failed attempt following a '%';
    // both sit at their end values until a '%' has been seen
    auto retry_str = str_end;
    auto retry_pat = pat_end;

    bool matched = true;
    for (;;) {
      // consume pattern characters against the target from the current positions
      while (pat_pos < pat_end) {
        bool const escaped = (*pat_pos == esc_char);
        // an escape prefix makes the following pattern character the one to compare
        if (escaped && (pat_pos + 1 < pat_end)) { ++pat_pos; }
        auto const pat_char = *pat_pos;

        if (!escaped && (pat_char == multi_wildcard)) {
          matched = true;
          ++pat_pos;
          if (pat_pos == pat_end) {
            // a trailing '%' absorbs whatever is left of the target
            str_pos = str_end;
            break;
          }
          // remember where to restart if the rest of the pattern fails here
          retry_pat = pat_pos;
          retry_str = str_pos;
          continue;
        }

        // literal character or '_': a target character must be available
        if (str_pos == str_end) {
          matched = false;
          break;
        }
        bool const any_char = !escaped && (pat_char == single_wildcard);
        matched             = any_char || (pat_char == *str_pos);
        if (!matched) { break; }
        ++str_pos;
        ++pat_pos;
      }

      // success only when the target was fully consumed as well
      if (matched && (str_pos == str_end)) { break; }

      matched = false;
      // no '%' to fall back on, or every restart position has been tried
      if (retry_pat == pat_end || retry_str == str_end) { break; }

      // let the last '%' absorb one more target character and try again
      pat_pos = retry_pat;
      str_pos = ++retry_str;
    }
    return matched;
  }
};

} // namespace

/**
 * @brief Stream-aware implementation of the strings `like` function.
 *
 * The output is a BOOL8 column with the same size as `input` whose null mask
 * is a copy of the input's. An empty input returns before the scalar
 * arguments are validated.
 *
 * @throw cudf::logic_error if `pattern` or `escape_character` is not valid
 *        (checked only for non-empty input)
 *
 * @param input Strings column to test
 * @param pattern Like pattern applied to every row
 * @param escape_character Escape prefix; only its first character is used
 * @param stream CUDA stream used for the device work issued here
 * @param mr Device memory resource used to allocate the returned column
 * @return New BOOL8 column
 */
std::unique_ptr<column> like(
  strings_column_view const& input,
  string_scalar const& pattern,
  string_scalar const& escape_character,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto results = make_numeric_column(data_type{type_id::BOOL8},
                                     input.size(),
                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                     input.null_count(),
                                     stream,
                                     mr);
  if (input.is_empty()) { return results; }

  CUDF_EXPECTS(pattern.is_valid(stream), "Parameter pattern must be valid");
  CUDF_EXPECTS(escape_character.is_valid(stream), "Parameter escape_character must be valid");

  auto const d_strings = column_device_view::create(input.parent(), stream);
  auto const d_pattern = pattern.value(stream);
  auto const d_escape  = escape_character.value(stream);

  // single lookup of the output pointer, reused as the transform destination
  auto d_results = results->mutable_view().data<bool>();

  // one like_fn evaluation per row index
  thrust::transform(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<size_type>(0),
                    thrust::make_counting_iterator<size_type>(input.size()),
                    d_results,
                    like_fn{*d_strings, d_pattern, d_escape});

  return results;
}

} // namespace detail

// external API

/**
 * @brief Public entry point for the strings `like` function.
 *
 * Forwards all arguments to `detail::like`, supplying
 * `cudf::default_stream_value` as the stream.
 */
std::unique_ptr<column> like(strings_column_view const& input,
                             string_scalar const& pattern,
                             string_scalar const& escape_character,
                             rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = cudf::default_stream_value;
  return detail::like(input, pattern, escape_character, stream, mr);
}

} // namespace strings
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ ConfigureTest(
strings/integers_tests.cpp
strings/ipv4_tests.cpp
strings/json_tests.cpp
strings/like_tests.cpp
strings/pad_tests.cpp
strings/repeat_strings_tests.cpp
strings/replace_regex_tests.cpp
Expand Down
Loading

0 comments on commit ccd72f2

Please sign in to comment.