From 76827f3ab7d4c2f1edf0dd648481438e186945fb Mon Sep 17 00:00:00 2001 From: Haled Odat <8566042+HalidOdat@users.noreply.github.com> Date: Tue, 2 Apr 2024 05:58:23 +0200 Subject: [PATCH 1/5] Initial optimization --- core/engine/src/builtins/regexp/mod.rs | 46 ++++++++++++++++---------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 926852cd565..3e657652011 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -943,9 +943,14 @@ impl RegExp { // 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. // 12. NOTE: Each element of input is considered to be a character. + // TODO: It would be better to put this in an enum. + // enum Matches { Utf16(..), Ucs2(..) } + let mut x = matcher.find_from_utf16(input, last_index as usize); + let mut y = matcher.find_from_ucs2(input, last_index as usize); + // 10. Let matchSucceeded be false. // 13. Repeat, while matchSucceeded is false, - let match_value = loop { + let match_value = { // a. If lastIndex > length, then if last_index > length { // i. If global is true or sticky is true, then @@ -960,12 +965,8 @@ impl RegExp { // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. // c. Let r be matcher(input, inputIndex). - let r: Option = if full_unicode { - matcher.find_from_utf16(input, last_index as usize).next() - } else { - matcher.find_from_ucs2(input, last_index as usize).next() - }; - + let r: Option = if full_unicode { x.next() } else { y.next() }; + // x. match r { // d. If r is failure, then None => { @@ -978,14 +979,19 @@ impl RegExp { return Ok(None); } - // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). - last_index = advance_string_index(input, last_index, full_unicode); + // i. If global is true or sticky is true, then + if global || sticky { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; + } + return Ok(None); } Some(m) => { + // FIXME: Fix unicode regex + // d. If r is failure, then - #[allow(clippy::if_not_else)] - if m.start() as u64 != last_index { + if full_unicode && m.start() != last_index as usize { // i. If sticky is true, then if sticky { // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). @@ -995,20 +1001,24 @@ impl RegExp { return Ok(None); } - // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). - last_index = advance_string_index(input, last_index, full_unicode); - // e. Else, - } else { - // i. Assert: r is a State. - // ii. Set matchSucceeded to true. - break m; + // i. If global is true or sticky is true, then + // if global || sticky { + // // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + // this.set(utf16!("lastIndex"), 0, true, context)?; + // } + return Ok(None); } + + // i. Assert: r is a State. + // ii. Set matchSucceeded to true. + m } } }; // 14. Let e be r's endIndex value. let e = match_value.end(); + last_index = match_value.start() as u64; // Note: This is already taken care of be regress. // 15. If fullUnicode is true, set e to GetStringIndex(S, e). From 77bc4a88a95569d639aa384a3b743bb9a00f02f6 Mon Sep 17 00:00:00 2001 From: Haled Odat <8566042+HalidOdat@users.noreply.github.com> Date: Tue, 2 Apr 2024 07:30:17 +0200 Subject: [PATCH 2/5] Fix sticky flag --- core/engine/src/builtins/regexp/mod.rs | 87 +++++++++++--------------- 1 file changed, 35 insertions(+), 52 deletions(-) diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 3e657652011..83f1d08499b 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -943,76 +943,59 @@ impl RegExp { // 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. // 12. NOTE: Each element of input is considered to be a character. + // TODO: Comment spec deviation // TODO: It would be better to put this in an enum. // enum Matches { Utf16(..), Ucs2(..) } - let mut x = matcher.find_from_utf16(input, last_index as usize); - let mut y = matcher.find_from_ucs2(input, last_index as usize); + let mut utf16_matches = matcher.find_from_utf16(input, last_index as usize); + let mut ucs2_matches = matcher.find_from_ucs2(input, last_index as usize); // 10. Let matchSucceeded be false. // 13. Repeat, while matchSucceeded is false, - let match_value = { - // a. If lastIndex > length, then - if last_index > length { + // a. If lastIndex > length, then + if last_index > length { + // i. If global is true or sticky is true, then + if global || sticky { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; + } + + // ii. Return null. + return Ok(None); + } + + // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. + // c. Let r be matcher(input, inputIndex). + let r: Option = if full_unicode { + utf16_matches.next() + } else { + ucs2_matches.next() + }; + + let match_value = match r { + // d. If r is failure, then + None => { // i. If global is true or sticky is true, then if global || sticky { // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). this.set(utf16!("lastIndex"), 0, true, context)?; } - // ii. Return null. return Ok(None); } + // i. Assert: r is a State. + Some(m) => { + if sticky && m.start() != last_index as usize { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; - // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. - // c. Let r be matcher(input, inputIndex). - let r: Option = if full_unicode { x.next() } else { y.next() }; - // x. - match r { - // d. If r is failure, then - None => { - // i. If sticky is true, then - if sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; - - // 2. Return null. - return Ok(None); - } - - // i. If global is true or sticky is true, then - if global || sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; - } + // 2. Return null. return Ok(None); } - Some(m) => { - // FIXME: Fix unicode regex - - // d. If r is failure, then - if full_unicode && m.start() != last_index as usize { - // i. If sticky is true, then - if sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; + // FIXME: Fix unicode regex - // 2. Return null. - return Ok(None); - } - - // i. If global is true or sticky is true, then - // if global || sticky { - // // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - // this.set(utf16!("lastIndex"), 0, true, context)?; - // } - return Ok(None); - } - - // i. Assert: r is a State. - // ii. Set matchSucceeded to true. - m - } + // ii. Set matchSucceeded to true. + m } }; From 3d108246ee39ce508ee71e9f92e3d92f91d90464 Mon Sep 17 00:00:00 2001 From: Haled Odat <8566042+HalidOdat@users.noreply.github.com> Date: Tue, 2 Apr 2024 19:50:46 +0200 Subject: [PATCH 3/5] Apply review --- core/engine/src/builtins/regexp/mod.rs | 49 ++++++++++---------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 83f1d08499b..51656669cce 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -944,10 +944,6 @@ impl RegExp { // 12. NOTE: Each element of input is considered to be a character. // TODO: Comment spec deviation - // TODO: It would be better to put this in an enum. - // enum Matches { Utf16(..), Ucs2(..) } - let mut utf16_matches = matcher.find_from_utf16(input, last_index as usize); - let mut ucs2_matches = matcher.find_from_ucs2(input, last_index as usize); // 10. Let matchSucceeded be false. // 13. Repeat, while matchSucceeded is false, @@ -963,42 +959,33 @@ impl RegExp { return Ok(None); } - // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. - // c. Let r be matcher(input, inputIndex). + // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. + // c. Let r be matcher(input, inputIndex). let r: Option = if full_unicode { - utf16_matches.next() + matcher.find_from_utf16(input, last_index as usize).next() } else { - ucs2_matches.next() + matcher.find_from_ucs2(input, last_index as usize).next() }; - let match_value = match r { + let Some(match_value) = r else { // d. If r is failure, then - None => { - // i. If global is true or sticky is true, then - if global || sticky { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; - } - - return Ok(None); + // i. If global is true or sticky is true, then + if global || sticky { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; } - // i. Assert: r is a State. - Some(m) => { - if sticky && m.start() != last_index as usize { - // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). - this.set(utf16!("lastIndex"), 0, true, context)?; - - // 2. Return null. - return Ok(None); - } - // FIXME: Fix unicode regex - - // ii. Set matchSucceeded to true. - m - } + return Ok(None); }; + if sticky && match_value.start() != last_index as usize { + // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). + this.set(utf16!("lastIndex"), 0, true, context)?; + + // 2. Return null. + return Ok(None); + } + // 14. Let e be r's endIndex value. let e = match_value.end(); last_index = match_value.start() as u64; From 86eb22553b6a916e9a4d29295498843c1e91ea71 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Tue, 2 Apr 2024 20:33:30 +0200 Subject: [PATCH 4/5] Update regress to 0.9.1 --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b0fbb420cf..c18ebc6b01a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3074,9 +3074,9 @@ dependencies = [ [[package]] name = "regress" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d06f9a1f7cd8473611ba1a480cf35f9c5cffc2954336ba90a982fdb7e7d7f51e" +checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479" dependencies = [ "hashbrown 0.14.3", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 1dc833c8b79..022c4f44813 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ once_cell = { version = "1.19.0", default-features = false } phf = { version = "0.11.2", default-features = false } pollster = "0.3.0" regex = "1.10.4" -regress = { version="0.9.0", features = ["utf16"]} +regress = { version="0.9.1", features = ["utf16"]} rustc-hash = { version = "1.1.0", default-features = false } serde_json = "1.0.114" serde = "1.0.197" From 166b44ac1ac9da5112d8d2d3ad0d0c01505d6c58 Mon Sep 17 00:00:00 2001 From: Haled Odat <8566042+HalidOdat@users.noreply.github.com> Date: Tue, 2 Apr 2024 22:11:55 +0200 Subject: [PATCH 5/5] Add spec deviation comments --- core/engine/src/builtins/regexp/mod.rs | 46 ++++++++++++++++---------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 51656669cce..f75552b512c 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -940,14 +940,14 @@ impl RegExp { // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16)); - // 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. - // 12. NOTE: Each element of input is considered to be a character. - - // TODO: Comment spec deviation + // NOTE: The following steps are take care of by regress: + // + // SKIP: 10. Let matchSucceeded be false. + // SKIP: 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. + // SKIP: 12. NOTE: Each element of input is considered to be a character. + // SKIP: 13. Repeat, while matchSucceeded is false, - // 10. Let matchSucceeded be false. - // 13. Repeat, while matchSucceeded is false, - // a. If lastIndex > length, then + // 13.a. If lastIndex > length, then if last_index > length { // i. If global is true or sticky is true, then if global || sticky { @@ -959,8 +959,8 @@ impl RegExp { return Ok(None); } - // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. - // c. Let r be matcher(input, inputIndex). + // 13.b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. + // 13.c. Let r be matcher(input, inputIndex). let r: Option = if full_unicode { matcher.find_from_utf16(input, last_index as usize).next() } else { @@ -969,15 +969,29 @@ impl RegExp { let Some(match_value) = r else { // d. If r is failure, then - // i. If global is true or sticky is true, then + // + // NOTE: Merged the following steps (since we no longer have a loop): + // 13.d.i. If sticky is true, then + // 13.a.i. If global is true or sticky is true, then if global || sticky { // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). this.set(utf16!("lastIndex"), 0, true, context)?; } + // MOVE: ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). + // NOTE: Handled within the regress matches iterator, see below for last_index assignment. + + // NOTE: Merged and steps: + // 13.a.ii. Return null. + // 13.d.i.2. Return null. return Ok(None); }; + // e. Else + // SKIP: i. Assert: r is a MatchState. + // SKIP: ii. Set matchSucceeded to true. + + // NOTE: regress currently doesn't support the sticky flag so we have to emulate it. if sticky && match_value.start() != last_index as usize { // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). this.set(utf16!("lastIndex"), 0, true, context)?; @@ -986,16 +1000,14 @@ impl RegExp { return Ok(None); } - // 14. Let e be r's endIndex value. - let e = match_value.end(); + // 13.d.ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). + // NOTE: Calculation of last_index is done in regress. last_index = match_value.start() as u64; - // Note: This is already taken care of be regress. + // 14. Let e be r's endIndex value. // 15. If fullUnicode is true, set e to GetStringIndex(S, e). - // e is an index into the Input character list, derived from S, matched by matcher. - // Let eUTF be the smallest index into S that corresponds to the character at element e of Input. - // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S. - // b. Set e to eUTF. + // NOTE: Step 15 is already taken care of by regress. + let e = match_value.end(); // 16. If global is true or sticky is true, then if global || sticky {