From 4398ffe9e403bb1254ee9b0d02b9a8d50e5b0673 Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Mon, 5 Sep 2022 16:59:33 +0200 Subject: [PATCH 1/9] Added URI decoders and encoders --- boa_engine/src/builtins/mod.rs | 1 + boa_engine/src/builtins/regexp/mod.rs | 5 +- boa_engine/src/builtins/string/mod.rs | 36 +- .../src/builtins/string/string_iterator.rs | 8 +- boa_engine/src/builtins/uri/mod.rs | 555 ++++++++++++++++++ 5 files changed, 596 insertions(+), 9 deletions(-) create mode 100644 boa_engine/src/builtins/uri/mod.rs diff --git a/boa_engine/src/builtins/mod.rs b/boa_engine/src/builtins/mod.rs index 89d8f32fed1..735e7bb70d2 100644 --- a/boa_engine/src/builtins/mod.rs +++ b/boa_engine/src/builtins/mod.rs @@ -32,6 +32,7 @@ pub mod string; pub mod symbol; pub mod typed_array; pub mod undefined; +pub mod uri; #[cfg(feature = "console")] pub mod console; diff --git a/boa_engine/src/builtins/regexp/mod.rs b/boa_engine/src/builtins/regexp/mod.rs index 8ca3f71ea95..98064bda9d1 100644 --- a/boa_engine/src/builtins/regexp/mod.rs +++ b/boa_engine/src/builtins/regexp/mod.rs @@ -1745,7 +1745,8 @@ fn advance_string_index(s: &JsString, index: u64, unicode: bool) -> u64 { } // 5. Let cp be ! CodePointAt(S, index). - let (_, offset, _) = crate::builtins::string::code_point_at(s, index); + let cp = crate::builtins::string::code_point_at(s, index); - index + u64::from(offset) + // 6. Return index + cp.[[CodeUnitCount]]. + index + u64::from(cp.code_unit_count) } diff --git a/boa_engine/src/builtins/string/mod.rs b/boa_engine/src/builtins/string/mod.rs index f77315dd40f..f3bdcda7280 100644 --- a/boa_engine/src/builtins/string/mod.rs +++ b/boa_engine/src/builtins/string/mod.rs @@ -40,7 +40,15 @@ pub(crate) enum Placement { End, } -pub(crate) fn code_point_at(string: &JsString, position: u64) -> (u32, u8, bool) { +/// Code point information for the `code_point_at()` function. +#[derive(Debug, Clone, Copy)] +pub(crate) struct CodePointInfo { + pub(crate) code_point: u32, + pub(crate) code_unit_count: u8, + pub(crate) is_unpaired_surrogate: bool, +} + +pub(crate) fn code_point_at(string: &JsString, position: u64) -> CodePointInfo { let mut encoded = string.encode_utf16(); let size = encoded.clone().count() as u64; @@ -48,21 +56,37 @@ pub(crate) fn code_point_at(string: &JsString, position: u64) -> (u32, u8, bool) .nth(position as usize) .expect("The callers of this function must've already checked bounds."); if !is_leading_surrogate(first) && !is_trailing_surrogate(first) { - return (u32::from(first), 1, false); + return CodePointInfo { + code_point: u32::from(first), + code_unit_count: 1, + is_unpaired_surrogate: false, + }; } if is_trailing_surrogate(first) || position + 1 == size { - return (u32::from(first), 1, true); + return CodePointInfo { + code_point: u32::from(first), + code_unit_count: 1, + is_unpaired_surrogate: true, + }; } let second = encoded .next() .expect("The callers of this function must've already checked bounds."); if !is_trailing_surrogate(second) { - return (u32::from(first), 1, true); + return CodePointInfo { + code_point: u32::from(first), + code_unit_count: 1, + is_unpaired_surrogate: true, + }; } let cp = (u32::from(first) - 0xD800) * 0x400 + (u32::from(second) - 0xDC00) + 0x10000; - (cp, 2, false) + CodePointInfo { + code_point: cp, + code_unit_count: 2, + is_unpaired_surrogate: false, + } } /// Helper function to check if a `char` is trimmable. @@ -544,7 +568,7 @@ impl String { IntegerOrInfinity::Integer(position) if (0..size).contains(&position) => { // 6. Let cp be ! CodePointAt(S, position). // 7. Return 𝔽(cp.[[CodePoint]]). - Ok(code_point_at(&string, position as u64).0.into()) + Ok(code_point_at(&string, position as u64).code_point.into()) } // 5. If position < 0 or position ≥ size, return undefined. _ => Ok(JsValue::undefined()), diff --git a/boa_engine/src/builtins/string/string_iterator.rs b/boa_engine/src/builtins/string/string_iterator.rs index 09983f532ad..21b1568ad6e 100644 --- a/boa_engine/src/builtins/string/string_iterator.rs +++ b/boa_engine/src/builtins/string/string_iterator.rs @@ -10,6 +10,8 @@ use crate::{ use boa_gc::{Finalize, Trace}; use boa_profiler::Profiler; +use super::CodePointInfo; + #[derive(Debug, Clone, Finalize, Trace)] pub struct StringIterator { string: JsValue, @@ -61,7 +63,11 @@ impl StringIterator { context, )); } - let (_, code_unit_count, _) = code_point_at(&native_string, position as u64); + let CodePointInfo { + code_point: _, + code_unit_count, + is_unpaired_surrogate: _, + } = code_point_at(&native_string, position as u64); string_iterator.next_index += i32::from(code_unit_count); let result_string = crate::builtins::string::String::substring( &string_iterator.string, diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs new file mode 100644 index 00000000000..66ec732ea1b --- /dev/null +++ b/boa_engine/src/builtins/uri/mod.rs @@ -0,0 +1,555 @@ +//! URI Handling Functions +//! +//! Uniform Resource Identifiers, or URIs, are Strings that identify resources (e.g. web pages or +//! files) and transport protocols by which to access them (e.g. HTTP or FTP) on the Internet. The +//! ECMAScript language itself does not provide any support for using URIs except for functions +//! that encode and decode URIs as described in 19.2.6.2, 19.2.6.3, 19.2.6.4 and 19.2.6.5 +//! +//! More information: +//! - [ECMAScript reference][spec] +//! +//! [spec]: https://tc39.es/ecma262/#sec-number-object + +use super::BuiltIn; +use crate::{ + builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, + JsValue, +}; + +/// Constant with all the unescaped URI characters. +/// +/// Contains `uriAlpha`, `DecimalDigit` and `uriMark` +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: hhttps://tc39.es/ecma262/#prod-uriUnescaped +const URI_UNESCAPED: [u16; 69] = [ + // uriAlpha + b'a' as u16, + b'b' as u16, + b'c' as u16, + b'd' as u16, + b'e' as u16, + b'f' as u16, + b'g' as u16, + b'h' as u16, + b'i' as u16, + b'j' as u16, + b'k' as u16, + b'l' as u16, + b'm' as u16, + b'n' as u16, + b'o' as u16, + b'p' as u16, + b'q' as u16, + b'r' as u16, + b's' as u16, + b't' as u16, + b'u' as u16, + b'v' as u16, + b'w' as u16, + b'x' as u16, + b'y' as u16, + b'z' as u16, + b'A' as u16, + b'B' as u16, + b'C' as u16, + b'D' as u16, + b'E' as u16, + b'F' as u16, + b'G' as u16, + b'H' as u16, + b'I' as u16, + b'J' as u16, + b'K' as u16, + b'L' as u16, + b'M' as u16, + b'N' as u16, + b'O' as u16, + b'P' as u16, + b'Q' as u16, + b'R' as u16, + b'S' as u16, + b'T' as u16, + b'U' as u16, + b'V' as u16, + b'W' as u16, + b'X' as u16, + b'Y' as u16, + // DecimalDigit + b'0' as u16, + b'1' as u16, + b'2' as u16, + b'3' as u16, + b'4' as u16, + b'5' as u16, + b'6' as u16, + b'7' as u16, + b'8' as u16, + b'9' as u16, + // uriMark + b'-' as u16, + b'_' as u16, + b'.' as u16, + b'!' as u16, + b'~' as u16, + b'*' as u16, + b'\'' as u16, + b'(' as u16, +]; + +/// URI Handling Functions +#[derive(Debug, Clone, Copy)] +pub(crate) struct Uri; + +impl BuiltIn for Uri { + const NAME: &'static str = "Number"; + + fn init(context: &mut Context) -> Option { + let decode_uri = FunctionBuilder::native(context, Self::decode_uri) + .name("decodeURI") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "decodeURI", + decode_uri, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + let decode_uri_component = FunctionBuilder::native(context, Self::decode_uri_component) + .name("decodeURIComponent") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "encodeURI", + decode_uri_component, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + let encode_uri = FunctionBuilder::native(context, Self::encode_uri) + .name("encodeURI") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "encodeURI", + encode_uri, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + let encode_uri_component = FunctionBuilder::native(context, Self::encode_uri_component) + .name("encodeURIComponent") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "encodeURIComponent", + encode_uri_component, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + None + } +} + +impl Uri { + /// Builtin JavaScript `decodeURI ( encodedURI )` function. + /// + /// This function computes a new version of a URI in which each escape sequence and UTF-8 + /// encoding of the sort that might be introduced by the `encodeURI` function is replaced with + /// the UTF-16 encoding of the code points that it represents. Escape sequences that could not + /// have been introduced by `encodeURI` are not replaced. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-decodeuri-encodeduri + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI + pub(crate) fn decode_uri( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let encoded_uri = args.get_or_undefined(0); + + // 1. Let uriString be ? ToString(encodedURI). + let uri_string = encoded_uri.to_string(context)?; + + // 2. Let reservedURISet be a String containing one instance of each code unit valid in uriReserved plus "#". + let reserved_uri_set = &URI_UNESCAPED; // TODO: add # + + // 3. Return ? Decode(uriString, reservedURISet). + Ok(JsValue::from(decode( + context, + &uri_string, + reserved_uri_set, + )?)) + } + + /// Builtin JavaScript `decodeURIComponent ( encodedURIComponent )` function. + /// + /// This function computes a new version of a URI in which each escape sequence and UTF-8 + /// encoding of the sort that might be introduced by the `encodeURIComponent` function is + /// replaced with the UTF-16 encoding of the code points that it represents. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-decodeuricomponent-encodeduricomponent + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent + pub(crate) fn decode_uri_component( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let encoded_uri_component = args.get_or_undefined(0); + + // 1. Let componentString be ? ToString(encodedURIComponent). + let component_string = encoded_uri_component.to_string(context)?; + + // 2. Let reservedURIComponentSet be the empty String. + let reserved_uri_component_set = &[]; + + // 3. Return ? Decode(componentString, reservedURIComponentSet). + Ok(JsValue::from(decode( + context, + &component_string, + reserved_uri_component_set, + )?)) + } + + /// Builtin JavaScript `encodeURI ( uri )` function. + /// + /// This function computes a new version of a UTF-16 encoded (6.1.4) URI in which each instance + /// of certain code points is replaced by one, two, three, or four escape sequences + /// representing the UTF-8 encoding of the code points. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-encodeuri-uri + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI + pub(crate) fn encode_uri( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let uri = args.get_or_undefined(0); + + // 1. Let uriString be ? ToString(uri). + let uri_string = uri.to_string(context)?; + + // 2. Let unescapedURISet be a String containing one instance of each code unit valid in uriReserved and uriUnescaped plus "#". + let unescaped_uri_set = &URI_UNESCAPED; // TODO: add # + + // 3. Return ? Encode(uriString, unescapedURISet). + Ok(JsValue::from(encode( + context, + &uri_string, + unescaped_uri_set, + )?)) + } + + /// Builtin JavaScript `encodeURIComponent ( uriComponent )` function. + /// + /// This function computes a new version of a UTF-16 encoded (6.1.4) URI in which each instance + /// of certain code points is replaced by one, two, three, or four escape sequences + /// representing the UTF-8 encoding of the code point. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent + pub(crate) fn encode_uri_component( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let uri_component = args.get_or_undefined(0); + + // 1. Let componentString be ? ToString(uriComponent). + let component_string = uri_component.to_string(context)?; + + // 2. Let unescapedURIComponentSet be a String containing one instance of each code unit valid in uriUnescaped. + let unescaped_uri_component_set = &URI_UNESCAPED; + + // 3. Return ? Encode(componentString, unescapedURIComponentSet). + Ok(JsValue::from(encode( + context, + &component_string, + unescaped_uri_component_set, + )?)) + } +} + +/// The `Encode ( string, unescapedSet )` abstract operation +/// +/// The abstract operation Encode takes arguments `string` (a String) and `unescapedSet` (a String) +/// and returns either a normal completion containing a String or a throw completion. It performs +/// URI encoding and escaping. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#sec-encode +fn encode(context: &mut Context, string: &JsString, unescaped_set: &[u16]) -> JsResult { + let code_units = string.encode_utf16().collect::>(); + + // 1. Let strLen be the length of string. + let str_len = code_units.len(); + + // 2. Let R be the empty String. + let mut r = String::new(); + + // 3. Let k be 0. + let mut k = 0; + // 4. Repeat, + loop { + // a. If k = strLen, return R. + if k == str_len { + return Ok(r); + } + + // b. Let C be the code unit at index k within string. + let c = code_units[k]; + + // c. If C is in unescapedSet, then + if unescaped_set.contains(&c) { + // i. Set k to k + 1. + k += 1; + + // ii. Set R to the string-concatenation of R and C. + r.push(char::from_u32(u32::from(c)).expect("char from code point cannot fail here")); + } else { + // d. Else, + // i. Let cp be CodePointAt(string, k). + let cp = crate::builtins::string::code_point_at(string, k as u64); + // ii. If cp.[[IsUnpairedSurrogate]] is true, throw a URIError exception. + if cp.is_unpaired_surrogate { + context.throw_uri_error("trying to encode an invalid string")?; + } + // iii. Set k to k + cp.[[CodeUnitCount]]. + k += cp.code_unit_count as usize; + + // iv. Let Octets be the List of octets resulting by applying the UTF-8 transformation + // to cp.[[CodePoint]]. + let mut buff = [0_u8; 4]; // Will never be more than 4 bytes + + let octets = char::from_u32(cp.code_point) + .expect("valid unicode code point to char conversion failed") + .encode_utf8(&mut buff); + + // v. For each element octet of Octets, do + for octet in octets.bytes() { + // 1. Set R to the string-concatenation of: + // R + // "%" + // the String representation of octet, formatted as a two-digit uppercase + // hexadecimal number, padded to the left with a zero if necessary + r = format!("{r}%{octet:0>2X}"); + } + } + } +} + +/// The `Decode ( string, reservedSet )` abstract operation. +/// +/// The abstract operation Decode takes arguments `string` (a String) and `reservedSet` (a String) +/// and returns either a normal completion containing a String or a throw completion. It performs +/// URI unescaping and decoding. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#sec-decode +#[allow(clippy::many_single_char_names)] +fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsResult { + let code_units = string.encode_utf16().collect::>(); + + // 1. Let strLen be the length of string. + let str_len = code_units.len(); + // 2. Let R be the empty String. + let mut r = Vec::new(); + + // 3. Let k be 0. + let mut k = 0; + // 4. Repeat, + loop { + // a. If k = strLen, return R. + if k == str_len { + return Ok(String::from_utf16(&r).expect("invalid UTF-16 characters found")); + } + + // b. Let C be the code unit at index k within string. + let c = code_units[k]; + + // c. If C is not the code unit 0x0025 (PERCENT SIGN), then + #[allow(clippy::if_not_else)] + let s = if c != 0x0025_u16 { + // i. Let S be the String value containing only the code unit C. + vec![c] + } else { + // d. Else, + // i. Let start be k. + let start = k; + + // ii. If k + 2 ≥ strLen, throw a URIError exception. + if k + 2 >= str_len { + context.throw_uri_error("invalid escape character found")?; + } + + // iii. If the code units at index (k + 1) and (k + 2) within string do not represent + // hexadecimal digits, throw a URIError exception. + let unit_k_1 = code_units[k + 1]; + let unit_k_2 = code_units[k + 2]; + if !is_hexdigit(unit_k_1) || !is_hexdigit(unit_k_2) { + context.throw_uri_error("invalid escape character found")?; + } + + // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). + let b = decode_byte(unit_k_1, unit_k_2); + + // v. Set k to k + 2. + k += 2; + + // vi. Let n be the number of leading 1 bits in B. + let n = leading_one_bits(b); + + // vii. If n = 0, then + if n == 0 { + // 1. Let C be the code unit whose value is B. + let c = u16::from(b); + + // 2. If C is not in reservedSet, then + if !reserved_set.contains(&c) { + // a. Let S be the String value containing only the code unit C. + vec![c] + } else { + // 3. Else, + // a. Let S be the substring of string from start to k + 1. + let mut s = Vec::new(); + s.extend_from_slice(&code_units[start..k + 1]); + s + } + } else { + // viii. Else, + // 1. If n = 1 or n > 4, throw a URIError exception. + if n == 1 || n > 4 { + context.throw_uri_error("TO-DO")?; + } + + // 2. If k + (3 × (n - 1)) ≥ strLen, throw a URIError exception. + if k + (3 * (n - 1)) > str_len { + context.throw_uri_error("TO-DO")?; + } + + // 3. Let Octets be « B ». + let mut octets = vec![b]; + + // 4. Let j be 1. + // 5. Repeat, while j < n, + for _j in 1..n { + // a. Set k to k + 1. + k += 1; + + // b. If the code unit at index k within string is not the code unit 0x0025 (PERCENT SIGN), throw a URIError exception. + if code_units[k] != 0x0025 { + context + .throw_uri_error("escape characters must be preceded with a % sign")?; + } + + // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. + let unit_k_1 = code_units[k + 1]; + let unit_k_2 = code_units[k + 2]; + if !is_hexdigit(unit_k_1) || !is_hexdigit(unit_k_2) { + context.throw_uri_error("invalid escape character")?; + } + + // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). + let b = decode_byte(unit_k_1, unit_k_2); + + // e. Set k to k + 2. + k += 2; + + // f. Append B to Octets. + octets.push(b); + + // g. Set j to j + 1. + } + + // 6. Assert: The length of Octets is n. + assert_eq!(octets.len(), n); + + // 7. If Octets does not contain a valid UTF-8 encoding of a Unicode code point, throw a URIError exception. + todo!(); + + // 8. Let V be the code point obtained by applying the UTF-8 transformation to Octets, that is, from a List of octets into a 21-bit value. + todo!(); + + // 9. Let S be UTF16EncodeCodePoint(V). + todo!() + } + }; + + // e. Set R to the string-concatenation of R and S. + r.extend_from_slice(&s); + + // f. Set k to k + 1. + k += 1; + } +} + +/// Checks if a given code unit is an hexadecimal digit represented in UTF-16. +fn is_hexdigit(code_unit: u16) -> bool { + use std::ops::RangeInclusive; + + const DIGIT: RangeInclusive = b'0' as u16..=b'9' as u16; + const HEX_UPPER: RangeInclusive = b'A' as u16..=b'F' as u16; + const HEX_LOWER: RangeInclusive = b'a' as u16..=b'f' as u16; + + DIGIT.contains(&code_unit) || HEX_UPPER.contains(&code_unit) || HEX_LOWER.contains(&code_unit) +} + +/// Decodes a byte from two unicode code units. It expects both to be hexadecimal characters. +fn decode_byte(high: u16, low: u16) -> u8 { + let high = high as u8 - b'0'; + let low = low as u8 - b'0'; + + (high << 4) + low +} + +/// Counts the number of leading 1 bits in a given byte. +fn leading_one_bits(byte: u8) -> usize { + // This uses a value table for speed + if byte == u8::MAX { + 8 + } else if byte == 0b1111_1110 { + 7 + } else if byte & 0b1111_1100 == 0b1111_1100 { + 6 + } else if byte & 0b1111_1000 == 0b1111_1000 { + 5 + } else if byte & 0b1111_0000 == 0b1111_0000 { + 4 + } else if byte & 0b1110_0000 == 0b1110_0000 { + 3 + } else if byte & 0b1100_1100 == 0b1100_0000 { + 2 + } else if byte & 0b1000_0000 == 0b1000_0000 { + 1 + } else { + 0 + } +} From ff0afd51cb52a4811def9c4e9bccab973191d251 Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Mon, 5 Sep 2022 23:58:23 +0200 Subject: [PATCH 2/9] Fixed some bugs, moved constants to submodule --- boa_engine/src/builtins/mod.rs | 5 +- boa_engine/src/builtins/uri/consts.rs | 202 ++++++++++++++++++++++++ boa_engine/src/builtins/uri/mod.rs | 212 ++++++++++++++------------ 3 files changed, 320 insertions(+), 99 deletions(-) create mode 100644 boa_engine/src/builtins/uri/consts.rs diff --git a/boa_engine/src/builtins/mod.rs b/boa_engine/src/builtins/mod.rs index 735e7bb70d2..d6aeda78608 100644 --- a/boa_engine/src/builtins/mod.rs +++ b/boa_engine/src/builtins/mod.rs @@ -82,7 +82,7 @@ use crate::{ builtins::{ array_buffer::ArrayBuffer, async_generator::AsyncGenerator, async_generator_function::AsyncGeneratorFunction, generator::Generator, - generator_function::GeneratorFunction, typed_array::TypedArray, + generator_function::GeneratorFunction, typed_array::TypedArray, uri::Uri, }, property::{Attribute, PropertyDescriptor}, Context, JsValue, @@ -194,7 +194,8 @@ pub fn init(context: &mut Context) { Promise, AsyncFunction, AsyncGenerator, - AsyncGeneratorFunction + AsyncGeneratorFunction, + Uri }; #[cfg(feature = "intl")] diff --git a/boa_engine/src/builtins/uri/consts.rs b/boa_engine/src/builtins/uri/consts.rs new file mode 100644 index 00000000000..ff25ac6f7e1 --- /dev/null +++ b/boa_engine/src/builtins/uri/consts.rs @@ -0,0 +1,202 @@ +/// Constant with all the unescaped URI characters. +/// +/// Contains `uriAlpha`, `DecimalDigit` and `uriMark`. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped +pub(super) const URI_UNESCAPED: [u16; 71] = [ + // uriAlpha + b'a' as u16, + b'b' as u16, + b'c' as u16, + b'd' as u16, + b'e' as u16, + b'f' as u16, + b'g' as u16, + b'h' as u16, + b'i' as u16, + b'j' as u16, + b'k' as u16, + b'l' as u16, + b'm' as u16, + b'n' as u16, + b'o' as u16, + b'p' as u16, + b'q' as u16, + b'r' as u16, + b's' as u16, + b't' as u16, + b'u' as u16, + b'v' as u16, + b'w' as u16, + b'x' as u16, + b'y' as u16, + b'z' as u16, + b'A' as u16, + b'B' as u16, + b'C' as u16, + b'D' as u16, + b'E' as u16, + b'F' as u16, + b'G' as u16, + b'H' as u16, + b'I' as u16, + b'J' as u16, + b'K' as u16, + b'L' as u16, + b'M' as u16, + b'N' as u16, + b'O' as u16, + b'P' as u16, + b'Q' as u16, + b'R' as u16, + b'S' as u16, + b'T' as u16, + b'U' as u16, + b'V' as u16, + b'W' as u16, + b'X' as u16, + b'Y' as u16, + b'Z' as u16, + // DecimalDigit + b'0' as u16, + b'1' as u16, + b'2' as u16, + b'3' as u16, + b'4' as u16, + b'5' as u16, + b'6' as u16, + b'7' as u16, + b'8' as u16, + b'9' as u16, + // uriMark + b'-' as u16, + b'_' as u16, + b'.' as u16, + b'!' as u16, + b'~' as u16, + b'*' as u16, + b'\'' as u16, + b'(' as u16, + b')' as u16, +]; + +/// Constant with all the reserved URI characters, plus the hashtag symbol (`#`). +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriReserved +pub(super) const URI_RESERVED_HASH: [u16; 11] = [ + b';' as u16, + b'/' as u16, + b'?' as u16, + b':' as u16, + b'@' as u16, + b'&' as u16, + b'=' as u16, + b'+' as u16, + b'$' as u16, + b',' as u16, + // Extra: # symbol + b'#' as u16, +]; + +/// Constant with all the reserved and unescaped URI characters, plus the hashtag symbol (`#`). +/// +/// More information: +/// - [`uriReserved` in ECMAScript spec][uri_reserved] +/// - [`uriUnescaped` in ECMAScript spec][uri_unescaped] +/// +/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriUnescaped +pub(super) const URI_RESERVED_UNESCAPED_HASH: [u16; 82] = [ + // uriAlpha + b'a' as u16, + b'b' as u16, + b'c' as u16, + b'd' as u16, + b'e' as u16, + b'f' as u16, + b'g' as u16, + b'h' as u16, + b'i' as u16, + b'j' as u16, + b'k' as u16, + b'l' as u16, + b'm' as u16, + b'n' as u16, + b'o' as u16, + b'p' as u16, + b'q' as u16, + b'r' as u16, + b's' as u16, + b't' as u16, + b'u' as u16, + b'v' as u16, + b'w' as u16, + b'x' as u16, + b'y' as u16, + b'z' as u16, + b'A' as u16, + b'B' as u16, + b'C' as u16, + b'D' as u16, + b'E' as u16, + b'F' as u16, + b'G' as u16, + b'H' as u16, + b'I' as u16, + b'J' as u16, + b'K' as u16, + b'L' as u16, + b'M' as u16, + b'N' as u16, + b'O' as u16, + b'P' as u16, + b'Q' as u16, + b'R' as u16, + b'S' as u16, + b'T' as u16, + b'U' as u16, + b'V' as u16, + b'W' as u16, + b'X' as u16, + b'Y' as u16, + b'Z' as u16, + // DecimalDigit + b'0' as u16, + b'1' as u16, + b'2' as u16, + b'3' as u16, + b'4' as u16, + b'5' as u16, + b'6' as u16, + b'7' as u16, + b'8' as u16, + b'9' as u16, + // uriMark + b'-' as u16, + b'_' as u16, + b'.' as u16, + b'!' as u16, + b'~' as u16, + b'*' as u16, + b'\'' as u16, + b'(' as u16, + b')' as u16, + // uriReserved + b';' as u16, + b'/' as u16, + b'?' as u16, + b':' as u16, + b'@' as u16, + b'&' as u16, + b'=' as u16, + b'+' as u16, + b'$' as u16, + b',' as u16, + // Extra: # symbol + b'#' as u16, +]; diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs index 66ec732ea1b..f8824dfebf3 100644 --- a/boa_engine/src/builtins/uri/mod.rs +++ b/boa_engine/src/builtins/uri/mod.rs @@ -10,101 +10,21 @@ //! //! [spec]: https://tc39.es/ecma262/#sec-number-object +mod consts; + use super::BuiltIn; use crate::{ builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, JsValue, }; - -/// Constant with all the unescaped URI characters. -/// -/// Contains `uriAlpha`, `DecimalDigit` and `uriMark` -/// -/// More information: -/// - [ECMAScript reference][spec] -/// -/// [spec]: hhttps://tc39.es/ecma262/#prod-uriUnescaped -const URI_UNESCAPED: [u16; 69] = [ - // uriAlpha - b'a' as u16, - b'b' as u16, - b'c' as u16, - b'd' as u16, - b'e' as u16, - b'f' as u16, - b'g' as u16, - b'h' as u16, - b'i' as u16, - b'j' as u16, - b'k' as u16, - b'l' as u16, - b'm' as u16, - b'n' as u16, - b'o' as u16, - b'p' as u16, - b'q' as u16, - b'r' as u16, - b's' as u16, - b't' as u16, - b'u' as u16, - b'v' as u16, - b'w' as u16, - b'x' as u16, - b'y' as u16, - b'z' as u16, - b'A' as u16, - b'B' as u16, - b'C' as u16, - b'D' as u16, - b'E' as u16, - b'F' as u16, - b'G' as u16, - b'H' as u16, - b'I' as u16, - b'J' as u16, - b'K' as u16, - b'L' as u16, - b'M' as u16, - b'N' as u16, - b'O' as u16, - b'P' as u16, - b'Q' as u16, - b'R' as u16, - b'S' as u16, - b'T' as u16, - b'U' as u16, - b'V' as u16, - b'W' as u16, - b'X' as u16, - b'Y' as u16, - // DecimalDigit - b'0' as u16, - b'1' as u16, - b'2' as u16, - b'3' as u16, - b'4' as u16, - b'5' as u16, - b'6' as u16, - b'7' as u16, - b'8' as u16, - b'9' as u16, - // uriMark - b'-' as u16, - b'_' as u16, - b'.' as u16, - b'!' as u16, - b'~' as u16, - b'*' as u16, - b'\'' as u16, - b'(' as u16, -]; +use consts::*; /// URI Handling Functions #[derive(Debug, Clone, Copy)] pub(crate) struct Uri; impl BuiltIn for Uri { - const NAME: &'static str = "Number"; + const NAME: &'static str = "Uri"; fn init(context: &mut Context) -> Option { let decode_uri = FunctionBuilder::native(context, Self::decode_uri) @@ -126,7 +46,7 @@ impl BuiltIn for Uri { .build(); context.register_global_property( - "encodeURI", + "decodeURIComponent", decode_uri_component, Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, ); @@ -184,7 +104,7 @@ impl Uri { let uri_string = encoded_uri.to_string(context)?; // 2. Let reservedURISet be a String containing one instance of each code unit valid in uriReserved plus "#". - let reserved_uri_set = &URI_UNESCAPED; // TODO: add # + let reserved_uri_set = &URI_RESERVED_HASH; // 3. Return ? Decode(uriString, reservedURISet). Ok(JsValue::from(decode( @@ -250,7 +170,7 @@ impl Uri { let uri_string = uri.to_string(context)?; // 2. Let unescapedURISet be a String containing one instance of each code unit valid in uriReserved and uriUnescaped plus "#". - let unescaped_uri_set = &URI_UNESCAPED; // TODO: add # + let unescaped_uri_set = &URI_RESERVED_UNESCAPED_HASH; // 3. Return ? Encode(uriString, unescapedURISet). Ok(JsValue::from(encode( @@ -336,10 +256,12 @@ fn encode(context: &mut Context, string: &JsString, unescaped_set: &[u16]) -> Js // d. Else, // i. Let cp be CodePointAt(string, k). let cp = crate::builtins::string::code_point_at(string, k as u64); + // ii. If cp.[[IsUnpairedSurrogate]] is true, throw a URIError exception. if cp.is_unpaired_surrogate { context.throw_uri_error("trying to encode an invalid string")?; } + // iii. Set k to k + cp.[[CodeUnitCount]]. k += cp.code_unit_count as usize; @@ -440,7 +362,7 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR // 3. Else, // a. Let S be the substring of string from start to k + 1. let mut s = Vec::new(); - s.extend_from_slice(&code_units[start..k + 1]); + s.extend_from_slice(&code_units[start..=k]); s } } else { @@ -493,13 +415,18 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR assert_eq!(octets.len(), n); // 7. If Octets does not contain a valid UTF-8 encoding of a Unicode code point, throw a URIError exception. - todo!(); - - // 8. Let V be the code point obtained by applying the UTF-8 transformation to Octets, that is, from a List of octets into a 21-bit value. - todo!(); + match String::from_utf8(octets) { + Err(_) => { + return Err(context.construct_uri_error("invalid UTF-8 encoding found")) + } + Ok(v) => { + // 8. Let V be the code point obtained by applying the UTF-8 transformation to Octets, that is, from a List of octets into a 21-bit value. - // 9. Let S be UTF16EncodeCodePoint(V). - todo!() + // 9. Let S be UTF16EncodeCodePoint(V). + // utf16_encode_codepoint(v) + v.encode_utf16().collect::>() + } + } } }; @@ -524,8 +451,28 @@ fn is_hexdigit(code_unit: u16) -> bool { /// Decodes a byte from two unicode code units. It expects both to be hexadecimal characters. fn decode_byte(high: u16, low: u16) -> u8 { - let high = high as u8 - b'0'; - let low = low as u8 - b'0'; + let high = u8::try_from(high).expect("invalid ASCII character found"); + let low = u8::try_from(low).expect("invalid ASCII character found"); + + let high = if (b'0'..=b'9').contains(&high) { + high - b'0' + } else if (b'A'..=b'Z').contains(&high) { + high - b'A' + 0x0A + } else if (b'a'..=b'z').contains(&high) { + high - b'a' + 0x0A + } else { + panic!("invalid ASCII hexadecimal digit found"); + }; + + let low = if (b'0'..=b'9').contains(&low) { + low - b'0' + } else if (b'A'..=b'Z').contains(&low) { + low - b'A' + 0x0A + } else if (b'a'..=b'z').contains(&low) { + low - b'a' + 0x0A + } else { + panic!("invalid ASCII hexadecimal digit found"); + }; (high << 4) + low } @@ -545,7 +492,7 @@ fn leading_one_bits(byte: u8) -> usize { 4 } else if byte & 0b1110_0000 == 0b1110_0000 { 3 - } else if byte & 0b1100_1100 == 0b1100_0000 { + } else if byte & 0b1100_0000 == 0b1100_0000 { 2 } else if byte & 0b1000_0000 == 0b1000_0000 { 1 @@ -553,3 +500,74 @@ fn leading_one_bits(byte: u8) -> usize { 0 } } + +// /// Generates the string representation of +// fn utf16_encode_codepoint(cp: u16) -> String { +// // 1. Assert: 0 ≤ cp ≤ 0x10FFFF. +// assert!(cp <= 0x10FFFF); + +// // 2. If cp ≤ 0xFFFF, return the String value consisting of the code unit whose value is cp. +// if cp <= 0xFFFF { +// return String::from_utf16(&[cp]).expect("invalid UTF-16 code units found"); +// } + +// // 3. Let cu1 be the code unit whose value is floor((cp - 0x10000) / 0x400) + 0xD800. +// let cu1 = ((cp - 0x10000) as f64 / 0x400 as f64).floor() as u16 + 0xD800; + +// // 4. Let cu2 be the code unit whose value is ((cp - 0x10000) modulo 0x400) + 0xDC00. +// let cu2 = ((cp - 0x10000) % 0x400) + 0xD800; + +// // 5. Return the string-concatenation of cu1 and cu2. +// String::from_utf16(&[cu1, cu2]).expect("invalid UTF-16 code units found") +// } + +#[cfg(test)] +mod tests { + use super::*; + + /// Checks if the `leading_one_bits()` function works as expected. + #[test] + fn ut_leading_one_bits() { + assert_eq!(leading_one_bits(0b1111_1111), 8); + assert_eq!(leading_one_bits(0b1111_1110), 7); + + assert_eq!(leading_one_bits(0b1111_1100), 6); + assert_eq!(leading_one_bits(0b1111_1101), 6); + + assert_eq!(leading_one_bits(0b1111_1011), 5); + assert_eq!(leading_one_bits(0b1111_1000), 5); + + assert_eq!(leading_one_bits(0b1111_0000), 4); + assert_eq!(leading_one_bits(0b1111_0111), 4); + + assert_eq!(leading_one_bits(0b1110_0000), 3); + assert_eq!(leading_one_bits(0b1110_1111), 3); + + assert_eq!(leading_one_bits(0b1100_0000), 2); + assert_eq!(leading_one_bits(0b1101_1111), 2); + + assert_eq!(leading_one_bits(0b1000_0000), 1); + assert_eq!(leading_one_bits(0b1011_1111), 1); + + assert_eq!(leading_one_bits(0b0000_0000), 0); + assert_eq!(leading_one_bits(0b0111_1111), 0); + } + + /// Checks that the `decode_byte()` function works as expected. + #[test] + fn ut_decode_byte() { + assert_eq!(decode_byte(u16::from(b'2'), u16::from(b'0')), 0x20); + assert_eq!(decode_byte(u16::from(b'2'), u16::from(b'A')), 0x2A); + assert_eq!(decode_byte(u16::from(b'3'), u16::from(b'C')), 0x3C); + assert_eq!(decode_byte(u16::from(b'4'), u16::from(b'0')), 0x40); + assert_eq!(decode_byte(u16::from(b'7'), u16::from(b'E')), 0x7E); + assert_eq!(decode_byte(u16::from(b'0'), u16::from(b'0')), 0x00); + } + + /// Checks that the `decode_byte()` panics with invalid ASCII characters. + #[test] + #[should_panic] + fn ut_decode_byte_rainy() { + decode_byte(u16::from(b'-'), u16::from(b'0')); + } +} From 5625de2307e7af482d807657e0aef2069de567d4 Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Tue, 6 Sep 2022 00:06:03 +0200 Subject: [PATCH 3/9] Fixed a clippy warning --- boa_engine/src/builtins/uri/mod.rs | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs index f8824dfebf3..23b7339547f 100644 --- a/boa_engine/src/builtins/uri/mod.rs +++ b/boa_engine/src/builtins/uri/mod.rs @@ -17,7 +17,7 @@ use crate::{ builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, JsValue, }; -use consts::*; +use consts::{URI_RESERVED_HASH, URI_RESERVED_UNESCAPED_HASH, URI_UNESCAPED}; /// URI Handling Functions #[derive(Debug, Clone, Copy)] @@ -501,26 +501,6 @@ fn leading_one_bits(byte: u8) -> usize { } } -// /// Generates the string representation of -// fn utf16_encode_codepoint(cp: u16) -> String { -// // 1. Assert: 0 ≤ cp ≤ 0x10FFFF. -// assert!(cp <= 0x10FFFF); - -// // 2. If cp ≤ 0xFFFF, return the String value consisting of the code unit whose value is cp. -// if cp <= 0xFFFF { -// return String::from_utf16(&[cp]).expect("invalid UTF-16 code units found"); -// } - -// // 3. Let cu1 be the code unit whose value is floor((cp - 0x10000) / 0x400) + 0xD800. -// let cu1 = ((cp - 0x10000) as f64 / 0x400 as f64).floor() as u16 + 0xD800; - -// // 4. Let cu2 be the code unit whose value is ((cp - 0x10000) modulo 0x400) + 0xDC00. -// let cu2 = ((cp - 0x10000) % 0x400) + 0xD800; - -// // 5. Return the string-concatenation of cu1 and cu2. -// String::from_utf16(&[cu1, cu2]).expect("invalid UTF-16 code units found") -// } - #[cfg(test)] mod tests { use super::*; From b2864b842d3a7f3289279ab61a59860777c41733 Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Tue, 6 Sep 2022 00:09:06 +0200 Subject: [PATCH 4/9] Fixed documentation link --- boa_engine/src/builtins/uri/consts.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/boa_engine/src/builtins/uri/consts.rs b/boa_engine/src/builtins/uri/consts.rs index ff25ac6f7e1..d934f541f33 100644 --- a/boa_engine/src/builtins/uri/consts.rs +++ b/boa_engine/src/builtins/uri/consts.rs @@ -110,7 +110,8 @@ pub(super) const URI_RESERVED_HASH: [u16; 11] = [ /// - [`uriReserved` in ECMAScript spec][uri_reserved] /// - [`uriUnescaped` in ECMAScript spec][uri_unescaped] /// -/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriUnescaped +/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved +/// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped pub(super) const URI_RESERVED_UNESCAPED_HASH: [u16; 82] = [ // uriAlpha b'a' as u16, From 2fd074834316f5ba044c7a1c39798b59d435822e Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Tue, 6 Sep 2022 09:42:41 +0200 Subject: [PATCH 5/9] Improved performance, implemented suggestions --- boa_engine/src/builtins/uri/consts.rs | 230 ++++++++------------------ boa_engine/src/builtins/uri/mod.rs | 156 +++++++++++------ 2 files changed, 173 insertions(+), 213 deletions(-) diff --git a/boa_engine/src/builtins/uri/consts.rs b/boa_engine/src/builtins/uri/consts.rs index d934f541f33..d06f1388b86 100644 --- a/boa_engine/src/builtins/uri/consts.rs +++ b/boa_engine/src/builtins/uri/consts.rs @@ -1,77 +1,41 @@ -/// Constant with all the unescaped URI characters. +//! URI handling function constants +//! +//! This module contains a few constants used to handle decoding and encoding for URI handling +//! functions. They make it easier and more performant to compare different ranges and code points. + +use std::ops::RangeInclusive; + +/// A range containing all the lowercase `uriAlpha` code points. /// -/// Contains `uriAlpha`, `DecimalDigit` and `uriMark`. +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha +const URI_ALPHA_LOWER: RangeInclusive = b'a' as u16..=b'z' as u16; + +/// A range containing all the uppercase `uriAlpha` code points. /// /// More information: /// - [ECMAScript reference][spec] /// -/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped -pub(super) const URI_UNESCAPED: [u16; 71] = [ - // uriAlpha - b'a' as u16, - b'b' as u16, - b'c' as u16, - b'd' as u16, - b'e' as u16, - b'f' as u16, - b'g' as u16, - b'h' as u16, - b'i' as u16, - b'j' as u16, - b'k' as u16, - b'l' as u16, - b'm' as u16, - b'n' as u16, - b'o' as u16, - b'p' as u16, - b'q' as u16, - b'r' as u16, - b's' as u16, - b't' as u16, - b'u' as u16, - b'v' as u16, - b'w' as u16, - b'x' as u16, - b'y' as u16, - b'z' as u16, - b'A' as u16, - b'B' as u16, - b'C' as u16, - b'D' as u16, - b'E' as u16, - b'F' as u16, - b'G' as u16, - b'H' as u16, - b'I' as u16, - b'J' as u16, - b'K' as u16, - b'L' as u16, - b'M' as u16, - b'N' as u16, - b'O' as u16, - b'P' as u16, - b'Q' as u16, - b'R' as u16, - b'S' as u16, - b'T' as u16, - b'U' as u16, - b'V' as u16, - b'W' as u16, - b'X' as u16, - b'Y' as u16, - b'Z' as u16, - // DecimalDigit - b'0' as u16, - b'1' as u16, - b'2' as u16, - b'3' as u16, - b'4' as u16, - b'5' as u16, - b'6' as u16, - b'7' as u16, - b'8' as u16, - b'9' as u16, - // uriMark +/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha +const URI_ALPHA_UPPER: RangeInclusive = b'A' as u16..=b'Z' as u16; + +/// A range containing all the `DecimalDigit` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-DecimalDigit +const DECIMAL_DIGIT: RangeInclusive = b'0' as u16..=b'9' as u16; + +/// An array containing all the `uriMark` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriMark +const URI_MARK: [u16; 9] = [ b'-' as u16, b'_' as u16, b'.' as u16, @@ -83,13 +47,13 @@ pub(super) const URI_UNESCAPED: [u16; 71] = [ b')' as u16, ]; -/// Constant with all the reserved URI characters, plus the hashtag symbol (`#`). +/// An array containing all the `uriReserved` code points. /// /// More information: /// - [ECMAScript reference][spec] /// /// [spec]: https://tc39.es/ecma262/#prod-uriReserved -pub(super) const URI_RESERVED_HASH: [u16; 11] = [ +const URI_RESERVED: [u16; 10] = [ b';' as u16, b'/' as u16, b'?' as u16, @@ -100,11 +64,37 @@ pub(super) const URI_RESERVED_HASH: [u16; 11] = [ b'+' as u16, b'$' as u16, b',' as u16, - // Extra: # symbol - b'#' as u16, ]; -/// Constant with all the reserved and unescaped URI characters, plus the hashtag symbol (`#`). +/// The number sign (`#`) symbol as a UTF-16 code potint. +const NUMBER_SIGN: u16 = b'#' as u16; + +/// Constant with all the unescaped URI characters. +/// +/// Contains `uriAlpha`, `DecimalDigit` and `uriMark`. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped +pub(super) fn is_uri_unescaped(code_point: u16) -> bool { + URI_ALPHA_LOWER.contains(&code_point) + || URI_ALPHA_UPPER.contains(&code_point) + || DECIMAL_DIGIT.contains(&code_point) + || URI_MARK.contains(&code_point) +} + +/// Constant with all the reserved URI characters, plus the number sign symbol (`#`). +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriReserved +pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool { + code_point == NUMBER_SIGN || URI_RESERVED.contains(&code_point) +} + +/// Constant with all the reserved and unescaped URI characters, plus the number sign symbol (`#`). /// /// More information: /// - [`uriReserved` in ECMAScript spec][uri_reserved] @@ -112,92 +102,6 @@ pub(super) const URI_RESERVED_HASH: [u16; 11] = [ /// /// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved /// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped -pub(super) const URI_RESERVED_UNESCAPED_HASH: [u16; 82] = [ - // uriAlpha - b'a' as u16, - b'b' as u16, - b'c' as u16, - b'd' as u16, - b'e' as u16, - b'f' as u16, - b'g' as u16, - b'h' as u16, - b'i' as u16, - b'j' as u16, - b'k' as u16, - b'l' as u16, - b'm' as u16, - b'n' as u16, - b'o' as u16, - b'p' as u16, - b'q' as u16, - b'r' as u16, - b's' as u16, - b't' as u16, - b'u' as u16, - b'v' as u16, - b'w' as u16, - b'x' as u16, - b'y' as u16, - b'z' as u16, - b'A' as u16, - b'B' as u16, - b'C' as u16, - b'D' as u16, - b'E' as u16, - b'F' as u16, - b'G' as u16, - b'H' as u16, - b'I' as u16, - b'J' as u16, - b'K' as u16, - b'L' as u16, - b'M' as u16, - b'N' as u16, - b'O' as u16, - b'P' as u16, - b'Q' as u16, - b'R' as u16, - b'S' as u16, - b'T' as u16, - b'U' as u16, - b'V' as u16, - b'W' as u16, - b'X' as u16, - b'Y' as u16, - b'Z' as u16, - // DecimalDigit - b'0' as u16, - b'1' as u16, - b'2' as u16, - b'3' as u16, - b'4' as u16, - b'5' as u16, - b'6' as u16, - b'7' as u16, - b'8' as u16, - b'9' as u16, - // uriMark - b'-' as u16, - b'_' as u16, - b'.' as u16, - b'!' as u16, - b'~' as u16, - b'*' as u16, - b'\'' as u16, - b'(' as u16, - b')' as u16, - // uriReserved - b';' as u16, - b'/' as u16, - b'?' as u16, - b':' as u16, - b'@' as u16, - b'&' as u16, - b'=' as u16, - b'+' as u16, - b'$' as u16, - b',' as u16, - // Extra: # symbol - b'#' as u16, -]; +pub(super) fn is_uri_reserved_or_uri_unescaped_or_number_sign(code_point: u16) -> bool { + code_point == NUMBER_SIGN || is_uri_unescaped(code_point) || URI_RESERVED.contains(&code_point) +} diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs index 23b7339547f..21948bf8bb9 100644 --- a/boa_engine/src/builtins/uri/mod.rs +++ b/boa_engine/src/builtins/uri/mod.rs @@ -8,16 +8,20 @@ //! More information: //! - [ECMAScript reference][spec] //! -//! [spec]: https://tc39.es/ecma262/#sec-number-object +//! [spec]: https://tc39.es/ecma262/#sec-uri-handling-functions mod consts; +use self::consts::{ + is_uri_reserved_or_number_sign, is_uri_reserved_or_uri_unescaped_or_number_sign, + is_uri_unescaped, +}; + use super::BuiltIn; use crate::{ builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, JsValue, }; -use consts::{URI_RESERVED_HASH, URI_RESERVED_UNESCAPED_HASH, URI_UNESCAPED}; /// URI Handling Functions #[derive(Debug, Clone, Copy)] @@ -104,7 +108,7 @@ impl Uri { let uri_string = encoded_uri.to_string(context)?; // 2. Let reservedURISet be a String containing one instance of each code unit valid in uriReserved plus "#". - let reserved_uri_set = &URI_RESERVED_HASH; + let reserved_uri_set = is_uri_reserved_or_number_sign; // 3. Return ? Decode(uriString, reservedURISet). Ok(JsValue::from(decode( @@ -137,7 +141,7 @@ impl Uri { let component_string = encoded_uri_component.to_string(context)?; // 2. Let reservedURIComponentSet be the empty String. - let reserved_uri_component_set = &[]; + let reserved_uri_component_set = |_: u16| false; // 3. Return ? Decode(componentString, reservedURIComponentSet). Ok(JsValue::from(decode( @@ -170,7 +174,7 @@ impl Uri { let uri_string = uri.to_string(context)?; // 2. Let unescapedURISet be a String containing one instance of each code unit valid in uriReserved and uriUnescaped plus "#". - let unescaped_uri_set = &URI_RESERVED_UNESCAPED_HASH; + let unescaped_uri_set = is_uri_reserved_or_uri_unescaped_or_number_sign; // 3. Return ? Encode(uriString, unescapedURISet). Ok(JsValue::from(encode( @@ -203,7 +207,7 @@ impl Uri { let component_string = uri_component.to_string(context)?; // 2. Let unescapedURIComponentSet be a String containing one instance of each code unit valid in uriUnescaped. - let unescaped_uri_component_set = &URI_UNESCAPED; + let unescaped_uri_component_set = is_uri_unescaped; // 3. Return ? Encode(componentString, unescapedURIComponentSet). Ok(JsValue::from(encode( @@ -224,7 +228,10 @@ impl Uri { /// - [ECMAScript reference][spec] /// /// [spec]: https://tc39.es/ecma262/#sec-encode -fn encode(context: &mut Context, string: &JsString, unescaped_set: &[u16]) -> JsResult { +fn encode(context: &mut Context, string: &JsString, unescaped_set: F) -> JsResult +where + F: Fn(u16) -> bool, +{ let code_units = string.encode_utf16().collect::>(); // 1. Let strLen be the length of string. @@ -246,7 +253,7 @@ fn encode(context: &mut Context, string: &JsString, unescaped_set: &[u16]) -> Js let c = code_units[k]; // c. If C is in unescapedSet, then - if unescaped_set.contains(&c) { + if unescaped_set(c) { // i. Set k to k + 1. k += 1; @@ -297,7 +304,10 @@ fn encode(context: &mut Context, string: &JsString, unescaped_set: &[u16]) -> Js /// /// [spec]: https://tc39.es/ecma262/#sec-decode #[allow(clippy::many_single_char_names)] -fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsResult { +fn decode(context: &mut Context, string: &JsString, reserved_set: F) -> JsResult +where + F: Fn(u16) -> bool, +{ let code_units = string.encode_utf16().collect::>(); // 1. Let strLen be the length of string. @@ -321,7 +331,7 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR #[allow(clippy::if_not_else)] let s = if c != 0x0025_u16 { // i. Let S be the String value containing only the code unit C. - vec![c] + Vec::from([c]) } else { // d. Else, // i. Let start be k. @@ -334,14 +344,8 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR // iii. If the code units at index (k + 1) and (k + 2) within string do not represent // hexadecimal digits, throw a URIError exception. - let unit_k_1 = code_units[k + 1]; - let unit_k_2 = code_units[k + 2]; - if !is_hexdigit(unit_k_1) || !is_hexdigit(unit_k_2) { - context.throw_uri_error("invalid escape character found")?; - } - // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_byte(unit_k_1, unit_k_2); + let b = decode_byte(code_units[k + 1], code_units[k + 2], context)?; // v. Set k to k + 2. k += 2; @@ -355,30 +359,28 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR let c = u16::from(b); // 2. If C is not in reservedSet, then - if !reserved_set.contains(&c) { + if !reserved_set(c) { // a. Let S be the String value containing only the code unit C. - vec![c] + Vec::from([c]) } else { // 3. Else, // a. Let S be the substring of string from start to k + 1. - let mut s = Vec::new(); - s.extend_from_slice(&code_units[start..=k]); - s + Vec::from(&code_units[start..=k]) } } else { // viii. Else, // 1. If n = 1 or n > 4, throw a URIError exception. if n == 1 || n > 4 { - context.throw_uri_error("TO-DO")?; + context.throw_uri_error("invalid escaped character found")?; } // 2. If k + (3 × (n - 1)) ≥ strLen, throw a URIError exception. if k + (3 * (n - 1)) > str_len { - context.throw_uri_error("TO-DO")?; + context.throw_uri_error("non-terminated escape character found")?; } // 3. Let Octets be « B ». - let mut octets = vec![b]; + let mut octets = Vec::from([b]); // 4. Let j be 1. // 5. Repeat, while j < n, @@ -393,14 +395,8 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR } // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. - let unit_k_1 = code_units[k + 1]; - let unit_k_2 = code_units[k + 2]; - if !is_hexdigit(unit_k_1) || !is_hexdigit(unit_k_2) { - context.throw_uri_error("invalid escape character")?; - } - // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_byte(unit_k_1, unit_k_2); + let b = decode_byte(code_units[k + 1], code_units[k + 2], context)?; // e. Set k to k + 2. k += 2; @@ -439,6 +435,7 @@ fn decode(context: &mut Context, string: &JsString, reserved_set: &[u16]) -> JsR } /// Checks if a given code unit is an hexadecimal digit represented in UTF-16. +#[inline] fn is_hexdigit(code_unit: u16) -> bool { use std::ops::RangeInclusive; @@ -450,9 +447,15 @@ fn is_hexdigit(code_unit: u16) -> bool { } /// Decodes a byte from two unicode code units. It expects both to be hexadecimal characters. -fn decode_byte(high: u16, low: u16) -> u8 { - let high = u8::try_from(high).expect("invalid ASCII character found"); - let low = u8::try_from(low).expect("invalid ASCII character found"); +fn decode_byte(high: u16, low: u16, context: &mut Context) -> JsResult { + // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. + if !is_hexdigit(high) || !is_hexdigit(low) { + context.throw_uri_error("invalid escape character")?; + } + + // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). + let high = high as u8; + let low = low as u8; let high = if (b'0'..=b'9').contains(&high) { high - b'0' @@ -461,7 +464,7 @@ fn decode_byte(high: u16, low: u16) -> u8 { } else if (b'a'..=b'z').contains(&high) { high - b'a' + 0x0A } else { - panic!("invalid ASCII hexadecimal digit found"); + unreachable!("invalid ASCII hexadecimal digit found"); }; let low = if (b'0'..=b'9').contains(&low) { @@ -471,13 +474,14 @@ fn decode_byte(high: u16, low: u16) -> u8 { } else if (b'a'..=b'z').contains(&low) { low - b'a' + 0x0A } else { - panic!("invalid ASCII hexadecimal digit found"); + unreachable!("invalid ASCII hexadecimal digit found"); }; - (high << 4) + low + Ok((high << 4) + low) } /// Counts the number of leading 1 bits in a given byte. +#[inline] fn leading_one_bits(byte: u8) -> usize { // This uses a value table for speed if byte == u8::MAX { @@ -505,6 +509,38 @@ fn leading_one_bits(byte: u8) -> usize { mod tests { use super::*; + /// Checks that the `is_hexdigit()` function works as expected. + #[test] + fn ut_is_hexdigit() { + for b in b'0'..=b'9' { + assert!(is_hexdigit(u16::from(b)), "failed: {b}"); + } + + for b in b'a'..=b'f' { + assert!(is_hexdigit(u16::from(b)), "failed: {b}"); + } + + for b in b'A'..=b'F' { + assert!(is_hexdigit(u16::from(b)), "failed: {b}"); + } + + for b in 0x00..b'0' { + assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); + } + + for b in b'9' + 1..b'A' { + assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); + } + + for b in b'F' + 1..b'a' { + assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); + } + + for b in b'f' + 1..=0xFF { + assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); + } + } + /// Checks if the `leading_one_bits()` function works as expected. #[test] fn ut_leading_one_bits() { @@ -536,18 +572,38 @@ mod tests { /// Checks that the `decode_byte()` function works as expected. #[test] fn ut_decode_byte() { - assert_eq!(decode_byte(u16::from(b'2'), u16::from(b'0')), 0x20); - assert_eq!(decode_byte(u16::from(b'2'), u16::from(b'A')), 0x2A); - assert_eq!(decode_byte(u16::from(b'3'), u16::from(b'C')), 0x3C); - assert_eq!(decode_byte(u16::from(b'4'), u16::from(b'0')), 0x40); - assert_eq!(decode_byte(u16::from(b'7'), u16::from(b'E')), 0x7E); - assert_eq!(decode_byte(u16::from(b'0'), u16::from(b'0')), 0x00); - } + let mut context = Context::default(); - /// Checks that the `decode_byte()` panics with invalid ASCII characters. - #[test] - #[should_panic] - fn ut_decode_byte_rainy() { - decode_byte(u16::from(b'-'), u16::from(b'0')); + // Sunny day tests + assert_eq!( + decode_byte(u16::from(b'2'), u16::from(b'0'), &mut context).unwrap(), + 0x20 + ); + assert_eq!( + decode_byte(u16::from(b'2'), u16::from(b'A'), &mut context).unwrap(), + 0x2A + ); + assert_eq!( + decode_byte(u16::from(b'3'), u16::from(b'C'), &mut context).unwrap(), + 0x3C + ); + assert_eq!( + decode_byte(u16::from(b'4'), u16::from(b'0'), &mut context).unwrap(), + 0x40 + ); + assert_eq!( + decode_byte(u16::from(b'7'), u16::from(b'E'), &mut context).unwrap(), + 0x7E + ); + assert_eq!( + decode_byte(u16::from(b'0'), u16::from(b'0'), &mut context).unwrap(), + 0x00 + ); + + // Rainy day tests + assert!(decode_byte(u16::from(b'-'), u16::from(b'0'), &mut context).is_err()); + assert!(decode_byte(u16::from(b'f'), u16::from(b'~'), &mut context).is_err()); + assert!(decode_byte(u16::from(b'A'), 0_u16, &mut context).is_err()); + assert!(decode_byte(u16::from(b'%'), u16::from(b'&'), &mut context).is_err()); } } From c6eb572242382c1e542f57d924e6f1a23201798a Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Tue, 6 Sep 2022 10:23:05 +0200 Subject: [PATCH 6/9] Added some tests, improved documentation, updated some specs --- boa_engine/src/builtins/string/mod.rs | 81 +++++++++++++++++++------ boa_engine/src/builtins/string/tests.rs | 15 +++++ boa_engine/src/builtins/uri/mod.rs | 4 +- 3 files changed, 81 insertions(+), 19 deletions(-) diff --git a/boa_engine/src/builtins/string/mod.rs b/boa_engine/src/builtins/string/mod.rs index f3bdcda7280..900346fc3d3 100644 --- a/boa_engine/src/builtins/string/mod.rs +++ b/boa_engine/src/builtins/string/mod.rs @@ -40,7 +40,7 @@ pub(crate) enum Placement { End, } -/// Code point information for the `code_point_at()` function. +/// Code point information for the `CodePointAt` abstract operation. #[derive(Debug, Clone, Copy)] pub(crate) struct CodePointInfo { pub(crate) code_point: u32, @@ -48,40 +48,74 @@ pub(crate) struct CodePointInfo { pub(crate) is_unpaired_surrogate: bool, } +/// The `CodePointAt ( string, position )` abstract operation. +/// +/// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a +/// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point), +/// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It +/// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads +/// from it a single code point starting with the code unit at index `position`. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#sec-codepointat pub(crate) fn code_point_at(string: &JsString, position: u64) -> CodePointInfo { let mut encoded = string.encode_utf16(); + + // 1. Let size be the length of string. let size = encoded.clone().count() as u64; + // 2. Assert: position ≥ 0 and position < size. + assert!(position < size); + + // 3. Let first be the code unit at index position within string. let first = encoded .nth(position as usize) .expect("The callers of this function must've already checked bounds."); + + // 4. Let cp be the code point whose numeric value is that of first. + let cp = u32::from(first); + + // 5. If first is not a leading surrogate or trailing surrogate, then if !is_leading_surrogate(first) && !is_trailing_surrogate(first) { + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }. return CodePointInfo { - code_point: u32::from(first), + code_point: cp, code_unit_count: 1, is_unpaired_surrogate: false, }; } + // 6. If first is a trailing surrogate or position + 1 = size, then if is_trailing_surrogate(first) || position + 1 == size { + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. return CodePointInfo { - code_point: u32::from(first), + code_point: cp, code_unit_count: 1, is_unpaired_surrogate: true, }; } + // 7. Let second be the code unit at index position + 1 within string. let second = encoded .next() .expect("The callers of this function must've already checked bounds."); + + // 8. If second is not a trailing surrogate, then if !is_trailing_surrogate(second) { + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. return CodePointInfo { - code_point: u32::from(first), + code_point: cp, code_unit_count: 1, is_unpaired_surrogate: true, }; } + + // 9. Set cp to UTF16SurrogatePairToCodePoint(first, second). let cp = (u32::from(first) - 0xD800) * 0x400 + (u32::from(second) - 0xDC00) + 0x10000; + + // 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }. CodePointInfo { code_point: cp, code_unit_count: 2, @@ -110,10 +144,22 @@ pub(crate) fn is_trimmable_whitespace(c: char) -> bool { ) } +/// Checks if the given code unit is a leading surrogate. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#leading-surrogate pub(crate) fn is_leading_surrogate(value: u16) -> bool { (0xD800..=0xDBFF).contains(&value) } +/// Checks if the given code unit is a trailing surrogate. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#trailing-surrogate pub(crate) fn is_trailing_surrogate(value: u16) -> bool { (0xDC00..=0xDFFF).contains(&value) } @@ -393,7 +439,7 @@ impl String { } } - /// `String.fromCharCode(...codePoints)` + /// `String.fromCharCode(...codeUnits)` /// /// Construct a `String` from one or more code points (as numbers). /// More information: @@ -405,21 +451,22 @@ impl String { args: &[JsValue], context: &mut Context, ) -> JsResult { - // 1. Let length be the number of elements in codeUnits. - // 2. Let elements be a new empty List. - let mut elements = Vec::new(); - // 3. For each element next of codeUnits, do + // 1. Let result be the empty String. + let mut result = Vec::new(); + + // 2. For each element next of codeUnits, do for next in args { - // 3a. Let nextCU be ℝ(? ToUint16(next)). - // 3b. Append nextCU to the end of elements. - elements.push(next.to_uint16(context)?); - } + // a. Let nextCU be the code unit whose numeric value is ℝ(? ToUint16(next)). + let next_cu = next.to_uint16(context)?; - // 4. Return the String value whose code units are the elements in the List elements. - // If codeUnits is empty, the empty String is returned. + // b. Set result to the string-concatenation of result and nextCU. + result.push(next_cu); + } - let s = std::string::String::from_utf16_lossy(elements.as_slice()); - Ok(JsValue::String(JsString::new(s))) + // 3. Return result. + Ok(JsValue::String(JsString::new( + std::string::String::from_utf16_lossy(&result), + ))) } /// `String.prototype.toString ( )` diff --git a/boa_engine/src/builtins/string/tests.rs b/boa_engine/src/builtins/string/tests.rs index a8f3b2ea809..cefd29a8d87 100644 --- a/boa_engine/src/builtins/string/tests.rs +++ b/boa_engine/src/builtins/string/tests.rs @@ -1,3 +1,4 @@ +use super::{is_leading_surrogate, is_trailing_surrogate}; use crate::{forward, forward_val, Context}; #[test] @@ -1150,3 +1151,17 @@ fn search() { assert_eq!(forward(&mut context, "'aa'.search(/a/g)"), "0"); assert_eq!(forward(&mut context, "'ba'.search(/a/)"), "1"); } + +#[test] +fn ut_is_leading_surrogate() { + for cp in 0xD800..=0xDBFF { + assert!(is_leading_surrogate(cp), "failed: {cp:X}"); + } +} + +#[test] +fn ut_is_trailing_surrogate() { + for cp in 0xDC00..=0xDFFF { + assert!(is_trailing_surrogate(cp), "failed: {cp:X}"); + } +} diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs index 21948bf8bb9..6ae8610df80 100644 --- a/boa_engine/src/builtins/uri/mod.rs +++ b/boa_engine/src/builtins/uri/mod.rs @@ -17,7 +17,7 @@ use self::consts::{ is_uri_unescaped, }; -use super::BuiltIn; +use super::{string::code_point_at, BuiltIn}; use crate::{ builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, JsValue, @@ -262,7 +262,7 @@ where } else { // d. Else, // i. Let cp be CodePointAt(string, k). - let cp = crate::builtins::string::code_point_at(string, k as u64); + let cp = code_point_at(string, k as u64); // ii. If cp.[[IsUnpairedSurrogate]] is true, throw a URIError exception. if cp.is_unpaired_surrogate { From 57060ed6c22455ac7fd9f6958d9a9d6929359780 Mon Sep 17 00:00:00 2001 From: Iban Eguia Moraza Date: Wed, 7 Sep 2022 18:21:05 +0200 Subject: [PATCH 7/9] Implemented suggestion --- boa_engine/src/builtins/uri/mod.rs | 127 ++++++++++------------------- 1 file changed, 41 insertions(+), 86 deletions(-) diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs index 6ae8610df80..464e5a6bd51 100644 --- a/boa_engine/src/builtins/uri/mod.rs +++ b/boa_engine/src/builtins/uri/mod.rs @@ -345,7 +345,7 @@ where // iii. If the code units at index (k + 1) and (k + 2) within string do not represent // hexadecimal digits, throw a URIError exception. // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_byte(code_units[k + 1], code_units[k + 2], context)?; + let b = decode_hex_byte(code_units[k + 1], code_units[k + 2], context)?; // v. Set k to k + 2. k += 2; @@ -396,7 +396,7 @@ where // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_byte(code_units[k + 1], code_units[k + 2], context)?; + let b = decode_hex_byte(code_units[k + 1], code_units[k + 2], context)?; // e. Set k to k + 2. k += 2; @@ -434,50 +434,34 @@ where } } -/// Checks if a given code unit is an hexadecimal digit represented in UTF-16. -#[inline] -fn is_hexdigit(code_unit: u16) -> bool { - use std::ops::RangeInclusive; - - const DIGIT: RangeInclusive = b'0' as u16..=b'9' as u16; - const HEX_UPPER: RangeInclusive = b'A' as u16..=b'F' as u16; - const HEX_LOWER: RangeInclusive = b'a' as u16..=b'f' as u16; - - DIGIT.contains(&code_unit) || HEX_UPPER.contains(&code_unit) || HEX_LOWER.contains(&code_unit) -} +/// Decodes a byte from two unicode code units. +fn decode_hex_byte(high: u16, low: u16, context: &mut Context) -> JsResult { + match (u8::try_from(high), u8::try_from(low)) { + (Ok(high), Ok(low)) => { + let high = if (b'0'..=b'9').contains(&high) { + high - b'0' + } else if (b'A'..=b'F').contains(&high) { + high - b'A' + 0x0A + } else if (b'a'..=b'f').contains(&high) { + high - b'a' + 0x0a + } else { + return Err(context.construct_uri_error("invalid hexadecimal digit found")); + }; + + let low = if (b'0'..=b'9').contains(&low) { + low - b'0' + } else if (b'A'..=b'F').contains(&low) { + low - b'A' + 0x0A + } else if (b'a'..=b'f').contains(&low) { + low - b'a' + 0x0a + } else { + return Err(context.construct_uri_error("invalid hexadecimal digit found")); + }; -/// Decodes a byte from two unicode code units. It expects both to be hexadecimal characters. -fn decode_byte(high: u16, low: u16, context: &mut Context) -> JsResult { - // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. - if !is_hexdigit(high) || !is_hexdigit(low) { - context.throw_uri_error("invalid escape character")?; + Ok((high << 4) + low) + } + _ => Err(context.construct_uri_error("invalid hexadecimal digit found")), } - - // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let high = high as u8; - let low = low as u8; - - let high = if (b'0'..=b'9').contains(&high) { - high - b'0' - } else if (b'A'..=b'Z').contains(&high) { - high - b'A' + 0x0A - } else if (b'a'..=b'z').contains(&high) { - high - b'a' + 0x0A - } else { - unreachable!("invalid ASCII hexadecimal digit found"); - }; - - let low = if (b'0'..=b'9').contains(&low) { - low - b'0' - } else if (b'A'..=b'Z').contains(&low) { - low - b'A' + 0x0A - } else if (b'a'..=b'z').contains(&low) { - low - b'a' + 0x0A - } else { - unreachable!("invalid ASCII hexadecimal digit found"); - }; - - Ok((high << 4) + low) } /// Counts the number of leading 1 bits in a given byte. @@ -509,38 +493,6 @@ fn leading_one_bits(byte: u8) -> usize { mod tests { use super::*; - /// Checks that the `is_hexdigit()` function works as expected. - #[test] - fn ut_is_hexdigit() { - for b in b'0'..=b'9' { - assert!(is_hexdigit(u16::from(b)), "failed: {b}"); - } - - for b in b'a'..=b'f' { - assert!(is_hexdigit(u16::from(b)), "failed: {b}"); - } - - for b in b'A'..=b'F' { - assert!(is_hexdigit(u16::from(b)), "failed: {b}"); - } - - for b in 0x00..b'0' { - assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); - } - - for b in b'9' + 1..b'A' { - assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); - } - - for b in b'F' + 1..b'a' { - assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); - } - - for b in b'f' + 1..=0xFF { - assert!(!is_hexdigit(u16::from(b)), "failed: {b}"); - } - } - /// Checks if the `leading_one_bits()` function works as expected. #[test] fn ut_leading_one_bits() { @@ -576,34 +528,37 @@ mod tests { // Sunny day tests assert_eq!( - decode_byte(u16::from(b'2'), u16::from(b'0'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'2'), u16::from(b'0'), &mut context).unwrap(), 0x20 ); assert_eq!( - decode_byte(u16::from(b'2'), u16::from(b'A'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'2'), u16::from(b'A'), &mut context).unwrap(), 0x2A ); assert_eq!( - decode_byte(u16::from(b'3'), u16::from(b'C'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'3'), u16::from(b'C'), &mut context).unwrap(), 0x3C ); assert_eq!( - decode_byte(u16::from(b'4'), u16::from(b'0'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'4'), u16::from(b'0'), &mut context).unwrap(), 0x40 ); assert_eq!( - decode_byte(u16::from(b'7'), u16::from(b'E'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'7'), u16::from(b'E'), &mut context).unwrap(), 0x7E ); assert_eq!( - decode_byte(u16::from(b'0'), u16::from(b'0'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'0'), u16::from(b'0'), &mut context).unwrap(), 0x00 ); // Rainy day tests - assert!(decode_byte(u16::from(b'-'), u16::from(b'0'), &mut context).is_err()); - assert!(decode_byte(u16::from(b'f'), u16::from(b'~'), &mut context).is_err()); - assert!(decode_byte(u16::from(b'A'), 0_u16, &mut context).is_err()); - assert!(decode_byte(u16::from(b'%'), u16::from(b'&'), &mut context).is_err()); + assert!(decode_hex_byte(u16::from(b'-'), u16::from(b'0'), &mut context).is_err()); + assert!(decode_hex_byte(u16::from(b'f'), u16::from(b'~'), &mut context).is_err()); + assert!(decode_hex_byte(u16::from(b'A'), 0_u16, &mut context).is_err()); + assert!(decode_hex_byte(u16::from(b'%'), u16::from(b'&'), &mut context).is_err()); + + assert!(decode_hex_byte(0xFACD_u16, u16::from(b'-'), &mut context).is_err()); + assert!(decode_hex_byte(u16::from(b'-'), 0xA0FD_u16, &mut context).is_err()); } } From e17ea72a63733552dff96292a76dd2325882a363 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Sun, 18 Sep 2022 03:44:14 +0200 Subject: [PATCH 8/9] Use `char::to_digit` in hex decoding --- boa_engine/src/builtins/uri/mod.rs | 70 ++++++++++++------------------ 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs index 464e5a6bd51..19fb39d9486 100644 --- a/boa_engine/src/builtins/uri/mod.rs +++ b/boa_engine/src/builtins/uri/mod.rs @@ -345,7 +345,8 @@ where // iii. If the code units at index (k + 1) and (k + 2) within string do not represent // hexadecimal digits, throw a URIError exception. // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_hex_byte(code_units[k + 1], code_units[k + 2], context)?; + let b = decode_hex_byte(code_units[k + 1], code_units[k + 2]) + .ok_or_else(|| context.construct_uri_error("invalid hexadecimal digit found"))?; // v. Set k to k + 2. k += 2; @@ -396,7 +397,10 @@ where // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). - let b = decode_hex_byte(code_units[k + 1], code_units[k + 2], context)?; + let b = + decode_hex_byte(code_units[k + 1], code_units[k + 2]).ok_or_else(|| { + context.construct_uri_error("invalid hexadecimal digit found") + })?; // e. Set k to k + 2. k += 2; @@ -435,32 +439,16 @@ where } /// Decodes a byte from two unicode code units. -fn decode_hex_byte(high: u16, low: u16, context: &mut Context) -> JsResult { - match (u8::try_from(high), u8::try_from(low)) { - (Ok(high), Ok(low)) => { - let high = if (b'0'..=b'9').contains(&high) { - high - b'0' - } else if (b'A'..=b'F').contains(&high) { - high - b'A' + 0x0A - } else if (b'a'..=b'f').contains(&high) { - high - b'a' + 0x0a - } else { - return Err(context.construct_uri_error("invalid hexadecimal digit found")); - }; - - let low = if (b'0'..=b'9').contains(&low) { - low - b'0' - } else if (b'A'..=b'F').contains(&low) { - low - b'A' + 0x0A - } else if (b'a'..=b'f').contains(&low) { - low - b'a' + 0x0a - } else { - return Err(context.construct_uri_error("invalid hexadecimal digit found")); - }; - - Ok((high << 4) + low) - } - _ => Err(context.construct_uri_error("invalid hexadecimal digit found")), +fn decode_hex_byte(high: u16, low: u16) -> Option { + match ( + char::from_u32(u32::from(high)), + char::from_u32(u32::from(low)), + ) { + (Some(high), Some(low)) => match (high.to_digit(16), low.to_digit(16)) { + (Some(high), Some(low)) => Some(((high as u8) << 4) + low as u8), + _ => None, + }, + _ => None, } } @@ -524,41 +512,39 @@ mod tests { /// Checks that the `decode_byte()` function works as expected. #[test] fn ut_decode_byte() { - let mut context = Context::default(); - // Sunny day tests assert_eq!( - decode_hex_byte(u16::from(b'2'), u16::from(b'0'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'2'), u16::from(b'0')).unwrap(), 0x20 ); assert_eq!( - decode_hex_byte(u16::from(b'2'), u16::from(b'A'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'2'), u16::from(b'A')).unwrap(), 0x2A ); assert_eq!( - decode_hex_byte(u16::from(b'3'), u16::from(b'C'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'3'), u16::from(b'C')).unwrap(), 0x3C ); assert_eq!( - decode_hex_byte(u16::from(b'4'), u16::from(b'0'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'4'), u16::from(b'0')).unwrap(), 0x40 ); assert_eq!( - decode_hex_byte(u16::from(b'7'), u16::from(b'E'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'7'), u16::from(b'E')).unwrap(), 0x7E ); assert_eq!( - decode_hex_byte(u16::from(b'0'), u16::from(b'0'), &mut context).unwrap(), + decode_hex_byte(u16::from(b'0'), u16::from(b'0')).unwrap(), 0x00 ); // Rainy day tests - assert!(decode_hex_byte(u16::from(b'-'), u16::from(b'0'), &mut context).is_err()); - assert!(decode_hex_byte(u16::from(b'f'), u16::from(b'~'), &mut context).is_err()); - assert!(decode_hex_byte(u16::from(b'A'), 0_u16, &mut context).is_err()); - assert!(decode_hex_byte(u16::from(b'%'), u16::from(b'&'), &mut context).is_err()); + assert!(decode_hex_byte(u16::from(b'-'), u16::from(b'0')).is_none()); + assert!(decode_hex_byte(u16::from(b'f'), u16::from(b'~')).is_none()); + assert!(decode_hex_byte(u16::from(b'A'), 0_u16).is_none()); + assert!(decode_hex_byte(u16::from(b'%'), u16::from(b'&')).is_none()); - assert!(decode_hex_byte(0xFACD_u16, u16::from(b'-'), &mut context).is_err()); - assert!(decode_hex_byte(u16::from(b'-'), 0xA0FD_u16, &mut context).is_err()); + assert!(decode_hex_byte(0xFACD_u16, u16::from(b'-')).is_none()); + assert!(decode_hex_byte(u16::from(b'-'), 0xA0FD_u16).is_none()); } } From 2be3ca5e880773f387533dd057e8db6ba3adae48 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Sun, 18 Sep 2022 04:27:56 +0200 Subject: [PATCH 9/9] Inline uri functions --- boa_engine/src/builtins/uri/consts.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/boa_engine/src/builtins/uri/consts.rs b/boa_engine/src/builtins/uri/consts.rs index d06f1388b86..22ad56f5272 100644 --- a/boa_engine/src/builtins/uri/consts.rs +++ b/boa_engine/src/builtins/uri/consts.rs @@ -77,6 +77,7 @@ const NUMBER_SIGN: u16 = b'#' as u16; /// - [ECMAScript reference][spec] /// /// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped +#[inline] pub(super) fn is_uri_unescaped(code_point: u16) -> bool { URI_ALPHA_LOWER.contains(&code_point) || URI_ALPHA_UPPER.contains(&code_point) @@ -90,6 +91,7 @@ pub(super) fn is_uri_unescaped(code_point: u16) -> bool { /// - [ECMAScript reference][spec] /// /// [spec]: https://tc39.es/ecma262/#prod-uriReserved +#[inline] pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool { code_point == NUMBER_SIGN || URI_RESERVED.contains(&code_point) } @@ -102,6 +104,7 @@ pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool { /// /// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved /// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped +#[inline] pub(super) fn is_uri_reserved_or_uri_unescaped_or_number_sign(code_point: u16) -> bool { code_point == NUMBER_SIGN || is_uri_unescaped(code_point) || URI_RESERVED.contains(&code_point) }