From 713d5b8bd31463a0f5b45b8bf56f6491e43ad408 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 4 Jan 2024 17:42:12 -0500 Subject: [PATCH] Fix strings::contains matching end of string target --- cpp/src/strings/search/find.cu | 4 +-- cpp/tests/strings/find_tests.cpp | 47 ++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 1299e552565..d35f512e0f7 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -364,7 +364,7 @@ __global__ void contains_warp_parallel_fn(column_device_view const d_strings, // each thread of the warp will check just part of the string auto found = false; for (auto i = static_cast(idx % cudf::detail::warp_size); - !found && (i + d_target.size_bytes()) < d_str.size_bytes(); + !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); i += cudf::detail::warp_size) { // check the target matches this part of the d_str data if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; } diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index 5c0a5b760f5..3f291e870c0 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,22 +131,38 @@ TEST_F(StringsFindTest, FindLongStrings) results = cudf::strings::find(view, cudf::strings_column_view(targets)); expected = cudf::test::fixed_width_column_wrapper({7, 56, 0, 0, -1, 73, -1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::find(view, cudf::string_scalar("ing")); + expected = cudf::test::fixed_width_column_wrapper({-1, 86, 10, 73, -1, 58, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::rfind(view, cudf::string_scalar("ing")); + expected = cudf::test::fixed_width_column_wrapper({-1, 86, 10, 86, -1, 58, -1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsFindTest, Contains) { - cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""}, - {1, 1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper strings( + {"Héllo", "thesé", "", "lease", "tést strings", "", "eé", "éte"}, {1, 1, 0, 1, 1, 1, 1, 1}); auto strings_view = cudf::strings_column_view(strings); { - cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 1, 0, 0, 1, 1}, + {1, 1, 0, 1, 1, 1, 1, 1}); auto results = cudf::strings::contains(strings_view, cudf::string_scalar("e")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - cudf::test::strings_column_wrapper targets({"Hello", "é", "e", "x", "", ""}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 0, 1, 0}, {1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({1, 1, 0, 0, 1, 0, 1, 1}, + {1, 1, 0, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains(strings_view, cudf::string_scalar("é")); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + cudf::test::strings_column_wrapper targets({"Hello", "é", "e", "x", "", "", "n", "t"}, + {1, 1, 1, 1, 1, 0, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 0, 1, 0, 0, 1}, + {1, 1, 0, 1, 1, 1, 1, 1}); auto results = cudf::strings::contains(strings_view, cudf::strings_column_view(targets)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -161,15 +177,24 @@ TEST_F(StringsFindTest, ContainsLongStrings) "it returns the last position where value could be inserted without violating the ordering", "algorithms execution is parallelized as determined by an execution policy. t", "he this is a continuation of previous row to make sure string boundaries are honored", + "abcdefghijklmnopqrstuvwxyz 0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ !@#$%^&*()~", ""}); auto strings_view = cudf::strings_column_view(strings); auto results = cudf::strings::contains(strings_view, cudf::string_scalar("e")); - cudf::test::fixed_width_column_wrapper expected({1, 1, 1, 1, 1, 1, 0}); + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::contains(strings_view, cudf::string_scalar(" the ")); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::contains(strings_view, cudf::string_scalar("a")); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::contains(strings_view, cudf::string_scalar(" the ")); - cudf::test::fixed_width_column_wrapper expected2({0, 1, 0, 1, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected2); + results = cudf::strings::contains(strings_view, cudf::string_scalar("~")); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsFindTest, StartsWith)