Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Regex match check #3779

Merged
merged 5 commits into from
Apr 2, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -64,7 +64,7 @@ once_cell = { version = "1.19.0", default-features = false }
phf = { version = "0.11.2", default-features = false }
pollster = "0.3.0"
regex = "1.10.4"
regress = { version="0.9.0", features = ["utf16"]}
regress = { version="0.9.1", features = ["utf16"]}
rustc-hash = { version = "1.1.0", default-features = false }
serde_json = "1.0.114"
serde = "1.0.197"
122 changes: 57 additions & 65 deletions core/engine/src/builtins/regexp/mod.rs
Original file line number Diff line number Diff line change
@@ -940,82 +940,74 @@ impl RegExp {
// 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false.
let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16));

// 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S.
// 12. NOTE: Each element of input is considered to be a character.

// 10. Let matchSucceeded be false.
// 13. Repeat, while matchSucceeded is false,
let match_value = loop {
// a. If lastIndex > length, then
if last_index > length {
// i. If global is true or sticky is true, then
if global || sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?;
}
// NOTE: The following steps are taken care of by regress:
//
// SKIP: 10. Let matchSucceeded be false.
// SKIP: 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S.
// SKIP: 12. NOTE: Each element of input is considered to be a character.
// SKIP: 13. Repeat, while matchSucceeded is false,

// 13.a. If lastIndex > length, then
if last_index > length {
// i. If global is true or sticky is true, then
if global || sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?;
}

// ii. Return null.
return Ok(None);
// ii. Return null.
return Ok(None);
}

// 13.b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S.
// 13.c. Let r be matcher(input, inputIndex).
let r: Option<regress::Match> = if full_unicode {
matcher.find_from_utf16(input, last_index as usize).next()
} else {
matcher.find_from_ucs2(input, last_index as usize).next()
};

let Some(match_value) = r else {
// d. If r is failure, then
//
// NOTE: Merged the following steps (since we no longer have a loop):
// 13.d.i. If sticky is true, then
// 13.a.i. If global is true or sticky is true, then
if global || sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?;
}

// b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S.
// c. Let r be matcher(input, inputIndex).
let r: Option<regress::Match> = if full_unicode {
matcher.find_from_utf16(input, last_index as usize).next()
} else {
matcher.find_from_ucs2(input, last_index as usize).next()
};
// MOVE: ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
// NOTE: Handled within the regress matches iterator, see below for last_index assignment.

match r {
// d. If r is failure, then
None => {
// i. If sticky is true, then
if sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?;
// NOTE: Merged the following steps:
// 13.a.ii. Return null.
// 13.d.i.2. Return null.
return Ok(None);
};

// 2. Return null.
return Ok(None);
}
// e. Else
// SKIP: i. Assert: r is a MatchState.
// SKIP: ii. Set matchSucceeded to true.

// ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
last_index = advance_string_index(input, last_index, full_unicode);
}
// NOTE: regress currently doesn't support the sticky flag so we have to emulate it.
if sticky && match_value.start() != last_index as usize {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?;

Some(m) => {
// d. If r is failure, then
#[allow(clippy::if_not_else)]
if m.start() as u64 != last_index {
// i. If sticky is true, then
if sticky {
// 1. Perform ? Set(R, "lastIndex", +0𝔽, true).
this.set(utf16!("lastIndex"), 0, true, context)?;

// 2. Return null.
return Ok(None);
}
// 2. Return null.
return Ok(None);
}

// ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
last_index = advance_string_index(input, last_index, full_unicode);
// e. Else,
} else {
// i. Assert: r is a State.
// ii. Set matchSucceeded to true.
break m;
}
}
}
};
// 13.d.ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode).
// NOTE: Calculation of last_index is done in regress.
last_index = match_value.start() as u64;

// 14. Let e be r's endIndex value.
let e = match_value.end();

// Note: This is already taken care of by regress.
// 15. If fullUnicode is true, set e to GetStringIndex(S, e).
// e is an index into the Input character list, derived from S, matched by matcher.
// Let eUTF be the smallest index into S that corresponds to the character at element e of Input.
// If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S.
// b. Set e to eUTF.
// NOTE: Step 15 is already taken care of by regress.
let e = match_value.end();

// 16. If global is true or sticky is true, then
if global || sticky {