From 041eccebbb00d81690b0f9f35b3a395d4aa9de41 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Mon, 29 Jan 2024 18:44:47 +0100 Subject: [PATCH 1/2] Update regress to v0.8.0 and use UTF16 / UCS2 matching --- Cargo.lock | 22 +++++----- Cargo.toml | 4 +- core/engine/src/builtins/regexp/mod.rs | 60 +++++++++----------------- test262_config.toml | 9 +--- 4 files changed, 35 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 21a7220bde2..1fa97970544 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,6 +49,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "anes" version = "0.1.6" @@ -1502,15 +1508,6 @@ dependencies = [ "ahash 0.7.7", ] -[[package]] -name = "hashbrown" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" -dependencies = [ - "ahash 0.8.7", -] - [[package]] name = "hashbrown" version = "0.14.3" @@ -1518,6 +1515,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ "ahash 0.8.7", + "allocator-api2", ] [[package]] @@ -2795,11 +2793,11 @@ dependencies = [ [[package]] name = "regress" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ed9969cad8051328011596bf549629f1b800cf1731e7964b1eef8dfc480d2c2" +checksum = "4f5f39ba4513916c1b2657b72af6ec671f091cd637992f58d0ede5cae4e5dea0" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.3", "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 97c3d7b9859..e6bcb03d538 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ members = [ exclude = [ "tests/fuzz", # Does 
weird things on Windows tests - "tests/src" # Just a hack to have fuzz inside tests + "tests/src", # Just a hack to have fuzz inside tests ] [workspace.package] @@ -59,7 +59,7 @@ once_cell = { version = "1.19.0", default-features = false } phf = { version = "0.11.2", default-features = false } pollster = "0.3.0" regex = "1.10.3" -regress = "0.7.1" +regress = { version="0.8.0", features = ["utf16"]} rustc-hash = { version = "1.1.0", default-features = false } serde_json = "1.0.111" serde = "1.0.195" diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 29fb4fe10ce..0796b1a4f2a 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -904,14 +904,12 @@ impl RegExp { // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. let full_unicode = flags.contains(&('u' as u16)) || flags.contains(&('v' as u16)); - // TODO: // 11. If fullUnicode is true, let input be StringToCodePoints(S). Otherwise, let input be a List whose elements are the code units that are the elements of S. // 12. NOTE: Each element of input is considered to be a character. // 10. Let matchSucceeded be false. // 13. Repeat, while matchSucceeded is false, - let lossy_input = input.to_std_string_escaped(); - let (match_value, last_byte_index) = loop { + let match_value = loop { // a. If lastIndex > length, then if last_index > length { // i. If global is true or sticky is true, then @@ -925,18 +923,12 @@ impl RegExp { } // b. Let inputIndex be the index into input of the character that was obtained from element lastIndex of S. - // Check if last_index is a valid utf8 index into input. - // TODO: avoid converting to String - let last_byte_index = match String::from_utf16(&input[..last_index as usize]) { - Ok(s) => s.len(), - Err(_) => { - return Err(JsNativeError::typ() - .with_message("Failed to get byte index from utf16 encoded string") - .into()) - } - }; // c. 
Let r be matcher(input, inputIndex). - let r = matcher.find_from(&lossy_input, last_byte_index).next(); + let r: Option<regress::Match> = if full_unicode { + matcher.find_from_utf16(input, last_index as usize).next() + } else { + matcher.find_from_ucs2(input, last_index as usize).next() + }; match r { // d. If r is failure, then @@ -957,7 +949,7 @@ impl RegExp { Some(m) => { // d. If r is failure, then #[allow(clippy::if_not_else)] - if m.start() != last_byte_index { + if m.start() as u64 != last_index { // i. If sticky is true, then if sticky { // 1. Perform ? Set(R, "lastIndex", +0𝔽, true). @@ -969,38 +961,30 @@ impl RegExp { // ii. Set lastIndex to AdvanceStringIndex(S, lastIndex, fullUnicode). last_index = advance_string_index(input, last_index, full_unicode); - // e. Else, + // e. Else, } else { // i. Assert: r is a State. // ii. Set matchSucceeded to true. - break (m, last_byte_index); + break m; } } } }; // 14. Let e be r's endIndex value. - let mut e = match_value.end(); + let e = match_value.end(); + // Note: This is already taken care of by regress. // 15. If fullUnicode is true, set e to GetStringIndex(S, e). - // TODO: disabled for now until we have UTF-16 support - if false { - // e is an index into the Input character list, derived from S, matched by matcher. - // Let eUTF be the smallest index into S that corresponds to the character at element e of Input. - // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S. - // b. Set e to eUTF. - e = input.get(..e).map_or_else(|| input.len(), <[u16]>::len); - } + // e is an index into the Input character list, derived from S, matched by matcher. + // Let eUTF be the smallest index into S that corresponds to the character at element e of Input. + // If e is greater than or equal to the number of elements in Input, then eUTF is the number of code units in S. + // b. Set e to eUTF. // 16. If global is true or sticky is true, then if global || sticky { // a. Perform ? 
Set(R, "lastIndex", 𝔽(e), true). - this.set( - utf16!("lastIndex"), - lossy_input[..e].encode_utf16().count(), - true, - context, - )?; + this.set(utf16!("lastIndex"), e, true, context)?; } // 17. Let n be the number of elements in r's captures List. @@ -1039,7 +1023,7 @@ impl RegExp { .expect("this CreateDataPropertyOrThrow call must not fail"); // 28. Let matchedSubstr be GetMatchString(S, match). - let matched_substr = js_string!(&lossy_input[last_byte_index..e]); + let matched_substr = js_string!(&input[(last_index as usize)..(e)]); // 29. Perform ! CreateDataPropertyOrThrow(A, "0", matchedSubstr). a.create_data_property_or_throw(0, matched_substr, context) @@ -1069,8 +1053,7 @@ impl RegExp { for (name, range) in named_groups { let name = js_string!(name); if let Some(range) = range { - // TODO: Full UTF-16 regex support - let value = js_string!(&lossy_input[range.clone()]); + let value = js_string!(&input[range.clone()]); groups .create_data_property_or_throw(name.clone(), value, context) @@ -1130,10 +1113,9 @@ impl RegExp { // b. If captureI is undefined, let capturedValue be undefined. // c. Else if fullUnicode is true, then // d. Else, - // TODO: Full UTF-16 regex support - let captured_value = capture.clone().map_or_else(JsValue::undefined, |range| { - js_string!(&lossy_input[range]).into() - }); + let captured_value = capture + .clone() + .map_or_else(JsValue::undefined, |range| js_string!(&input[range]).into()); // e. Perform ! CreateDataPropertyOrThrow(A, ! ToString(𝔽(i)), capturedValue). a.create_data_property_or_throw(i, captured_value.clone(), context) diff --git a/test262_config.toml b/test262_config.toml index 39111cd2cc3..794c973f24a 100644 --- a/test262_config.toml +++ b/test262_config.toml @@ -16,6 +16,7 @@ features = [ "Intl.RelativeTimeFormat", "Intl-enumeration", "Intl.NumberFormat-v3", + "regexp-v-flag", ### Pending proposals @@ -68,12 +69,6 @@ features = [ ### Non-standard "caller", - - ### RegExp tests that check individual codepoints. 
- ### They are not useful considering the cpu time they waste. - "regexp-unicode-property-escapes", ] -# RegExp tests that check individual codepoints. -# They are not useful considering the cpu time they waste. -tests = ["CharacterClassEscapes", "NumberFormat"] +tests = ["NumberFormat"] From 175ae8945cd4a167f99dca5317425d511896e08c Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Mon, 29 Jan 2024 19:13:16 +0100 Subject: [PATCH 2/2] Fix test --- core/engine/src/builtins/regexp/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/engine/src/builtins/regexp/tests.rs b/core/engine/src/builtins/regexp/tests.rs index 710234d91b5..ac711fd0ec9 100644 --- a/core/engine/src/builtins/regexp/tests.rs +++ b/core/engine/src/builtins/regexp/tests.rs @@ -125,7 +125,7 @@ fn no_panic_on_parse_fail() { TestAction::assert_native_error( r"var re = /]/u;", JsNativeErrorKind::Syntax, - "Invalid regular expression literal: Unbalanced bracket at line 1, col 10", + "Invalid regular expression literal: Invalid atom character at line 1, col 10", ), TestAction::assert_native_error( r"var re = /a{/u;",