Skip to content

Commit

Permalink
Fix octal pattern matching in regex string (#9993)
Browse files Browse the repository at this point in the history
Closes #9946 

Fixes the decoding logic in the regex pattern compile step to consume only up to the last octal character. The original logic incorrectly discarded the next pattern character, and if the octal characters were specified at the end of the pattern, invalid bytes were read past the end of the pattern. This is what caused the intermittent failure, since sometimes the invalid bytes were 0, which masked the issue.

This PR also includes tests for octal patterns in various positions in the regex pattern.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Conor Hoekstra (https://github.com/codereport)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)

URL: #9993
  • Loading branch information
davidwendt authored Jan 14, 2022
1 parent ca77542 commit ce31d7d
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
8 changes: 4 additions & 4 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -258,10 +258,10 @@ class regex_parser {
// treating all quoted numbers as Octal, since we are not supporting backreferences
if (yy >= '0' && yy <= '7') {
yy = yy - '0';
char32_t c = *exprp++;
char32_t c = *exprp;
while (c >= '0' && c <= '7') {
yy = (yy << 3) | (c - '0');
c = *exprp++;
c = *(++exprp);
}
return CHAR;
} else {
Expand Down Expand Up @@ -926,7 +926,7 @@ void reprog::optimize2()
_startinst_ids.push_back(-1); // terminator mark
}

#ifndef NDBUG
#ifndef NDEBUG
void reprog::print(regex_flags const flags)
{
printf("Flags = 0x%08x\n", static_cast<uint32_t>(flags));
Expand Down
15 changes: 14 additions & 1 deletion cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -237,6 +237,19 @@ TEST_F(StringsContainsTests, MatchesIPV4Test)
}
}

// Checks that octal escapes (\101 is 'A', \132 is 'Z') are decoded correctly
// when they appear at the end, at the start, and in the middle of a pattern.
// Every pattern below is expected to match the same rows: "AZ" and "CDAZEY".
TEST_F(StringsContainsTests, OctalTest)
{
  cudf::test::strings_column_wrapper input({"AZ", "B", "CDAZEY", ""});
  cudf::test::fixed_width_column_wrapper<bool> expected({1, 0, 1, 0});
  auto const input_view = cudf::strings_column_view(input);

  for (std::string const pattern : {"\\101", "\\101Z", "D*\\101\\132"}) {
    auto const matches = cudf::strings::contains_re(input_view, pattern);
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*matches, expected);
  }
}

TEST_F(StringsContainsTests, EmbeddedNullCharacter)
{
std::vector<std::string> data(10);
Expand Down

0 comments on commit ce31d7d

Please sign in to comment.