Skip to content

Commit

Permalink
Merge branch 'branch-22.06' into strings-count-re-factor
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Apr 8, 2022
2 parents ac91a2c + 1cc3d8b commit 9c658a1
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 55 deletions.
85 changes: 68 additions & 17 deletions cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,48 +19,99 @@
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/filling.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/findall.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Benchmark fixture used by the regex benchmarks registered in this file
// (see BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F below). Adds no state of its own.
class StringContains : public cudf::benchmark {
};

/**
 * @brief Builds the strings column used as input for the regex benchmarks.
 *
 * Every row is a copy of one of a small, fixed set of strings. The column is first filled by
 * picking rows at random from all strings except the first one, then the first string (the one
 * annotated below as matching the benchmark patterns) is written at evenly spaced row positions
 * so that roughly `hit_rate` percent of the rows hold it.
 *
 * @param n_rows   Number of rows in the returned column
 * @param hit_rate Percentage of rows that should hold the matching string
 * @return Strings column with `n_rows` rows gathered from the fixed data set
 */
std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows, int32_t hit_rate)
{
  // build input table using the following data
  auto data = cudf::test::strings_column_wrapper({
    "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns
    "012345 6789 01234 56789 0123 456",  // the rest do not match
    "abc 4567890 DEFGHI 0987 Wxyz 123",
    "abcdefghijklmnopqrstuvwxyz 01234",
    "",
    "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
    "9876543210,abcdefghijklmnopqrstU",
    "9876543210,abcdefghijklmnopqrstU",
    "123 édf 4567890 DéFG 0987 X5",
    "1",
  });
  auto data_view = cudf::column_view(data);

  // compute number of rows in n_rows that should match;
  // the product is formed in 64 bits so that large row counts cannot overflow int32_t
  auto const matches = static_cast<int32_t>((static_cast<int64_t>(n_rows) * hit_rate) / 100);

  // Create a randomized gather-map to build a column out of the strings in data.
  // Index 0 is excluded from the range so the random rows never pick the matching string.
  data_profile gather_profile;
  gather_profile.set_distribution_params(
    cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
  gather_profile.set_null_frequency(0.0);  // no nulls for gather-map
  gather_profile.set_cardinality(0);
  auto gather_table =
    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);

  // Nothing to scatter (tiny n_rows or a hit rate that rounds down to zero rows):
  // use the random gather-map as is. This also avoids dividing by zero in the
  // step computation (n_rows / matches) below.
  if (matches <= 0) {
    auto no_hits = cudf::gather(cudf::table_view({data_view}), gather_table->view().column(0));
    return std::move(no_hits->release().front());
  }

  // Create scatter map by placing 0-index values throughout the gather-map
  auto scatter_data = cudf::sequence(
    matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
  auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
  auto table       = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
  auto gather_map  = table->view().column(0);
  table            = cudf::gather(cudf::table_view({data_view}), gather_map);

  // front() yields an lvalue reference into the released vector, so the move is required
  return std::move(table->release().front());
}

// Selects which regex API BM_contains exercises:
//   contains -> cudf::strings::contains_re
//   count    -> cudf::strings::count_re
//   findall  -> cudf::strings::findall
enum contains_type { contains, count, findall };

// Regex patterns the benchmark can run; BM_contains picks one by index (state.range(1)).
// longer pattern lengths demand more working memory per string
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};

// Benchmark body shared by the regex benchmarks: runs one of contains_re / count_re / findall
// (chosen by `ct`) over a strings column inside the timed loop and reports bytes processed
// as iterations * input.chars_size().
//
// Benchmark arguments read from `state`:
//   range(0) - number of rows
//   range(1) - index into `patterns`
//   range(2) - hit rate passed to build_input_column
//
// NOTE(review): this listing appears to interleave the removed and the added lines of a diff
// without +/- markers (`n_rows` is declared twice, `input` is built both from
// create_random_table and from build_input_column, and the count/findall case labels are
// repeated). It does not compile as shown; confirm against the actual file before editing.
static void BM_contains(benchmark::State& state, contains_type ct)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows});
cudf::strings_column_view input(table->view().column(0));
auto const n_rows = static_cast<cudf::size_type>(state.range(0));
auto const pattern_index = static_cast<int32_t>(state.range(1));
auto const hit_rate = static_cast<int32_t>(state.range(2));

auto col = build_input_column(n_rows, hit_rate);
auto input = cudf::strings_column_view(col->view());

// NOTE(review): pattern_index is not range-checked against the size of `patterns`
auto pattern = patterns[pattern_index];

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
// contains_re(), matches_re(), and count_re() all have similar functions
// with count_re() being the most regex intensive
switch (ct) {
case contains_type::contains: // contains_re and matches_re use the same main logic
cudf::strings::contains_re(input, "\\d+");
cudf::strings::contains_re(input, pattern);
break;
case contains_type::count: // counts occurrences of pattern
cudf::strings::count_re(input, "\\d+");
case contains_type::count: // counts occurrences of matches
cudf::strings::count_re(input, pattern);
break;
case contains_type::findall: // returns occurrences of matches
cudf::strings::findall(input, "\\d+");
case contains_type::findall: // returns occurrences of all matches
cudf::strings::findall(input, pattern);
break;
}
}

state.SetBytesProcessed(state.iterations() * input.chars_size());
}

// Defines a StringContains fixture benchmark called `name` whose body forwards to
// BM_contains(st, contains_type::b), then registers it with manual timing.
//
// NOTE(review): two versions of the macro appear back to back here, which looks like the
// removed and the added lines of a diff shown without +/- markers. The first version registers
// a single row-count range (1 << 12 .. 1 << 24, multiplier 8); the second registers the
// product of row count, pattern index and hit rate. Confirm against the actual file.
#define STRINGS_BENCHMARK_DEFINE(name, b) \
BENCHMARK_DEFINE_F(StringContains, name) \
(::benchmark::State & st) { BM_contains(st, contains_type::b); } \
BENCHMARK_REGISTER_F(StringContains, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 24}}) \
->UseManualTime() \
#define STRINGS_BENCHMARK_DEFINE(name, b) \
BENCHMARK_DEFINE_F(StringContains, name) \
(::benchmark::State & st) { BM_contains(st, contains_type::b); } \
BENCHMARK_REGISTER_F(StringContains, name) \
->ArgsProduct({{4096, 32768, 262144, 2097152, 16777216}, /* row count */ \
{0, 1}, /* patterns index */ \
{1, 5, 10, 25, 70, 100}}) /* hit rate */ \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

STRINGS_BENCHMARK_DEFINE(contains_re, contains)
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/strings/regex/regex.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,10 +42,10 @@ class reprog;
using match_pair = thrust::pair<cudf::size_type, cudf::size_type>;
using match_result = thrust::optional<match_pair>;

constexpr int32_t RX_STACK_SMALL = 112; ///< fastest stack size
constexpr int32_t RX_STACK_MEDIUM = 1104; ///< faster stack size
constexpr int32_t RX_STACK_LARGE = 10128; ///< fast stack size
constexpr int32_t RX_STACK_ANY = 8; ///< slowest: uses global memory
constexpr int32_t RX_STACK_SMALL = 112; ///< fastest stack size
constexpr int32_t RX_STACK_MEDIUM = 1104; ///< faster stack size
constexpr int32_t RX_STACK_LARGE = 2560; ///< fast stack size
constexpr int32_t RX_STACK_ANY = 8; ///< slowest: uses global memory

/**
* @brief Mapping the number of instructions to device code stack memory size.
Expand Down
53 changes: 20 additions & 33 deletions cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
Expand Down Expand Up @@ -210,7 +211,7 @@ TEST_F(StringsContainsTests, MatchesIPV4Test)
"5.79.97.178",
"127.0.0.1"});
auto strings_view = cudf::strings_column_view(strings);
{ // is_ip
{ // is_ip: 58 instructions
std::string pattern =
"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
"$";
Expand All @@ -219,7 +220,7 @@ TEST_F(StringsContainsTests, MatchesIPV4Test)
{true, true, false, false, false, false, true, true, true, true});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
}
{ // is_loopback
{ // is_loopback: 72 instructions
std::string pattern =
"^127\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))"
"\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))"
Expand All @@ -229,7 +230,7 @@ TEST_F(StringsContainsTests, MatchesIPV4Test)
{false, false, false, false, false, false, false, false, false, true});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
}
{ // is_multicast
{ // is_multicast: 79 instructions
std::string pattern =
"^(2(2[4-9]|3[0-9]))\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))"
"\\.([0-9]|[1-9][0-9]|1([0-9][0-9])|2([0-4][0-9]|5[0-5]))"
Expand Down Expand Up @@ -428,29 +429,22 @@ TEST_F(StringsContainsTests, MediumRegex)
"5678901234567890",
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop"
"qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());

auto strings_view = cudf::strings_column_view(strings);
{
auto results = cudf::strings::contains_re(strings_view, medium_regex);
bool h_expected[] = {true, false, false};
cudf::test::fixed_width_column_wrapper<bool> expected(
h_expected,
h_expected + h_strings.size(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
h_expected + h_strings.size());
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}
{
auto results = cudf::strings::matches_re(strings_view, medium_regex);
bool h_expected[] = {true, false, false};
cudf::test::fixed_width_column_wrapper<bool> expected(
h_expected,
h_expected + h_strings.size(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
h_expected + h_strings.size());
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}
{
auto results = cudf::strings::count_re(strings_view, medium_regex);
Expand All @@ -475,29 +469,22 @@ TEST_F(StringsContainsTests, LargeRegex)
"5678901234567890",
"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop"
"qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());

auto strings_view = cudf::strings_column_view(strings);
{
auto results = cudf::strings::contains_re(strings_view, large_regex);
bool h_expected[] = {true, false, false};
cudf::test::fixed_width_column_wrapper<bool> expected(
h_expected,
h_expected + h_strings.size(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
h_expected + h_strings.size());
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}
{
auto results = cudf::strings::matches_re(strings_view, large_regex);
bool h_expected[] = {true, false, false};
cudf::test::fixed_width_column_wrapper<bool> expected(
h_expected,
h_expected + h_strings.size(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
h_expected + h_strings.size());
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}
{
auto results = cudf::strings::count_re(strings_view, large_regex);
Expand All @@ -510,8 +497,8 @@ TEST_F(StringsContainsTests, LargeRegex)

TEST_F(StringsContainsTests, ExtraLargeRegex)
{
// This results in ~950 regex instructions which is above the 'large' range.
std::string data(950, '0');
// This results in 321 regex instructions which is above the 'large' range.
std::string data(320, '0');
cudf::test::strings_column_wrapper strings({data, data, data, data, data, "00"});
std::string pattern = data;

Expand Down

0 comments on commit 9c658a1

Please sign in to comment.