diff --git a/Cargo.lock b/Cargo.lock index c38a4399598..3c551b98252 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -406,6 +406,7 @@ dependencies = [ "fast-float", "float-cmp", "icu_calendar", + "icu_casemapping", "icu_collator", "icu_datetime", "icu_list", diff --git a/boa_engine/Cargo.toml b/boa_engine/Cargo.toml index 0d48983bdf1..53567a57fa7 100644 --- a/boa_engine/Cargo.toml +++ b/boa_engine/Cargo.toml @@ -23,6 +23,7 @@ intl = [ "dep:icu_provider", "dep:icu_calendar", "dep:icu_collator", + "dep:icu_casemapping", "dep:icu_list", "dep:writeable", "dep:sys-locale", @@ -83,6 +84,7 @@ icu_collator = { version = "1.1.0", features = ["serde"], optional = true } icu_plurals = { version = "1.1.0", features = ["serde"], optional = true } icu_provider = { version = "1.1.0", optional = true } icu_list = { version = "1.1.0", features = ["serde"], optional = true } +icu_casemapping = { version = "0.7.1", features = ["serde"], optional = true} writeable = { version = "0.5.2", optional = true } sys-locale = { version = "0.3.0", optional = true } diff --git a/boa_engine/src/builtins/string/mod.rs b/boa_engine/src/builtins/string/mod.rs index 01e6b394c18..4a6dde565a4 100644 --- a/boa_engine/src/builtins/string/mod.rs +++ b/boa_engine/src/builtins/string/mod.rs @@ -122,8 +122,10 @@ impl IntrinsicObject for String { .method(Self::pad_end, "padEnd", 1) .method(Self::pad_start, "padStart", 1) .method(Self::trim, "trim", 0) - .method(Self::to_lowercase, "toLowerCase", 0) - .method(Self::to_uppercase, "toUpperCase", 0) + .method(Self::to_case::, "toLowerCase", 0) + .method(Self::to_case::, "toUpperCase", 0) + .method(Self::to_locale_case::, "toLocaleLowerCase", 0) + .method(Self::to_locale_case::, "toLocaleUpperCase", 0) .method(Self::substring, "substring", 2) .method(Self::split, "split", 2) .method(Self::value_of, "valueOf", 0) @@ -1644,18 +1646,19 @@ impl String { Ok(js_string!(string.trim_end()).into()) } - /// `String.prototype.toLowerCase()` + /// 
[`String.prototype.toUpperCase()`][upper] and [`String.prototype.toLowerCase()`][lower] /// - /// The `toLowerCase()` method returns the calling string value converted to lower case. + /// The case methods return the calling string value converted to uppercase or lowercase. + /// + /// The value will be **converted** to a string if it isn't one. /// /// More information: - /// - [ECMAScript reference][spec] /// - [MDN documentation][mdn] /// - /// [spec]: https://tc39.es/ecma262/#sec-string.prototype.tolowercase - /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toLowerCase - #[allow(clippy::wrong_self_convention)] - pub(crate) fn to_lowercase( + /// [upper]: https://tc39.es/ecma262/#sec-string.prototype.touppercase + /// [lower]: https://tc39.es/ecma262/#sec-string.prototype.tolowercase + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase + pub(crate) fn to_case( this: &JsValue, _: &[JsValue], context: &mut Context<'_>, @@ -1666,101 +1669,95 @@ impl String { // 2. Let S be ? ToString(O). let string = this.to_string(context)?; - let mut code_points = string.code_points(); - let mut lower_text = Vec::with_capacity(string.len()); - let mut next_unpaired_surrogate = None; - // 3. Let sText be ! StringToCodePoints(S). - // 4. Let lowerText be the result of toLowercase(sText), according to + // 4. Let upperText be the result of toUppercase(sText), according to // the Unicode Default Case Conversion algorithm. 
- loop { - let only_chars = code_points - .by_ref() - .map_while(|cpoint| match cpoint { - CodePoint::Unicode(c) => Some(c), - CodePoint::UnpairedSurrogate(s) => { - next_unpaired_surrogate = Some(s); - None - } - }) - .collect::() - .to_lowercase(); - - lower_text.extend(only_chars.encode_utf16()); - - if let Some(surr) = next_unpaired_surrogate.take() { - lower_text.push(surr); + let text = string.map_valid_segments(|s| { + if UPPER { + s.to_uppercase() } else { - break; + s.to_lowercase() } - } + }); - // 5. Let L be ! CodePointsToString(lowerText). + // 5. Let L be ! CodePointsToString(upperText). // 6. Return L. - Ok(js_string!(lower_text).into()) + Ok(js_string!(text).into()) } - /// `String.prototype.toUpperCase()` - /// - /// The `toUpperCase()` method returns the calling string value converted to uppercase. - /// - /// The value will be **converted** to a string if it isn't one - /// - /// More information: - /// - [ECMAScript reference][spec] - /// - [MDN documentation][mdn] + /// [`String.prototype.toLocaleLowerCase ( [ locales ] )`][lower] and + /// [`String.prototype.toLocaleUpperCase ( [ locales ] )`][upper] /// - /// [spec]: https://tc39.es/ecma262/#sec-string.prototype.toUppercase - /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase - #[allow(clippy::wrong_self_convention)] - pub(crate) fn to_uppercase( + /// [lower]: https://tc39.es/ecma402/#sup-string.prototype.tolocalelowercase + /// [upper]: https://tc39.es/ecma402/#sup-string.prototype.tolocaleuppercase + pub(crate) fn to_locale_case( this: &JsValue, - _: &[JsValue], + args: &[JsValue], context: &mut Context<'_>, ) -> JsResult { - // This function behaves in exactly the same way as `String.prototype.toLowerCase`, except that the String is - // mapped using the toUppercase algorithm of the Unicode Default Case Conversion. - - // Comments below are an adaptation of the `String.prototype.toLowerCase` documentation. - - // 1. Let O be ? 
RequireObjectCoercible(this value). - let this = this.require_object_coercible()?; - - // 2. Let S be ? ToString(O). - let string = this.to_string(context)?; - - let mut code_points = string.code_points(); - let mut upper_text = Vec::with_capacity(string.len()); - let mut next_unpaired_surrogate = None; - - // 3. Let sText be ! StringToCodePoints(S). - // 4. Let upperText be the result of toUppercase(sText), according to - // the Unicode Default Case Conversion algorithm. - loop { - let only_chars = code_points - .by_ref() - .map_while(|cpoint| match cpoint { - CodePoint::Unicode(c) => Some(c), - CodePoint::UnpairedSurrogate(s) => { - next_unpaired_surrogate = Some(s); - None - } - }) - .collect::() - .to_uppercase(); - - upper_text.extend(only_chars.encode_utf16()); + #[cfg(feature = "intl")] + { + use super::intl::locale::{ + best_available_locale, canonicalize_locale_list, default_locale, + }; + use icu_casemapping::{provider::CaseMappingV1Marker, CaseMapping}; + use icu_locid::LanguageIdentifier; + + // 1. Let O be ? RequireObjectCoercible(this value). + let this = this.require_object_coercible()?; + + // 2. Let S be ? ToString(O). + let string = this.to_string(context)?; + + // 3. Return ? TransformCase(S, locales, lower). + + // TransformCase ( S, locales, targetCase ) + // https://tc39.es/ecma402/#sec-transform-case + + // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales). + // 2. If requestedLocales is not an empty List, then + // a. Let requestedLocale be requestedLocales[0]. + let lang = canonicalize_locale_list(args.get_or_undefined(0), context)? + .into_iter() + .next() + // 3. Else, + // a. Let requestedLocale be ! DefaultLocale(). + .unwrap_or_else(|| default_locale(context.icu().locale_canonicalizer())) + .id; + // 4. Let noExtensionsLocale be the String value that is requestedLocale with any Unicode locale extension sequences (6.2.1) removed. + // 5. 
Let availableLocales be a List with language tags that includes the languages for which the Unicode Character Database contains language sensitive case mappings. Implementations may add additional language tags if they support case mapping for additional locales. + // 6. Let locale be ! BestAvailableLocale(availableLocales, noExtensionsLocale). + // 7. If locale is undefined, set locale to "und". + let lang = + best_available_locale::(lang, &context.icu().provider()) + .unwrap_or(LanguageIdentifier::UND); + + let casemapper = + CaseMapping::try_new_with_locale(&context.icu().provider(), &lang.into()) + .map_err(|err| JsNativeError::typ().with_message(err.to_string()))?; + + // 8. Let codePoints be StringToCodePoints(S). + let result = string.map_valid_segments(|segment| { + if UPPER { + // 10. Else, + // a. Assert: targetCase is upper. + // b. Let newCodePoints be a List whose elements are the result of an uppercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm. + casemapper.to_full_uppercase(&segment) + } else { + // 9. If targetCase is lower, then + // a. Let newCodePoints be a List whose elements are the result of a lowercase transformation of codePoints according to an implementation-derived algorithm using locale or the Unicode Default Case Conversion algorithm. + casemapper.to_full_lowercase(&segment) + } + }); - if let Some(surr) = next_unpaired_surrogate.take() { - upper_text.push(surr); - } else { - break; - } + // 11. Return CodePointsToString(newCodePoints). + Ok(result.into()) } - // 5. Let L be ! CodePointsToString(upperText). - // 6. Return L. 
- Ok(js_string!(upper_text).into()) + #[cfg(not(feature = "intl"))] + { + Self::to_case::(this, args, context) + } } /// `String.prototype.substring( indexStart[, indexEnd] )` diff --git a/boa_engine/src/string/mod.rs b/boa_engine/src/string/mod.rs index 1e365a5b99f..97430c66559 100644 --- a/boa_engine/src/string/mod.rs +++ b/boa_engine/src/string/mod.rs @@ -37,6 +37,7 @@ use std::{ cell::Cell, convert::Infallible, hash::{Hash, Hasher}, + iter::Peekable, ops::{Deref, Index}, process::abort, ptr::{self, addr_of, addr_of_mut, NonNull}, @@ -285,6 +286,74 @@ impl JsString { String::from_utf16(self) } + /// Decodes a [`JsString`] into an iterator of [`Result`], returning surrogates as + /// errors. + pub fn to_std_string_with_surrogates(&self) -> impl Iterator> + '_ { + struct WideStringDecoderIterator { + codepoints: Peekable, + } + + impl WideStringDecoderIterator { + fn new(iterator: I) -> Self { + WideStringDecoderIterator { + codepoints: iterator.peekable(), + } + } + } + + impl Iterator for WideStringDecoderIterator + where + I: Iterator, + { + type Item = Result; + + fn next(&mut self) -> Option { + let cp = self.codepoints.next()?; + let char = match cp { + CodePoint::Unicode(c) => c, + CodePoint::UnpairedSurrogate(surr) => return Some(Err(surr)), + }; + + let mut string = String::from(char); + + loop { + let Some(cp) = self.codepoints.peek().and_then(|cp| match cp { + CodePoint::Unicode(c) => Some(*c), + CodePoint::UnpairedSurrogate(_) => None, + }) else { break; }; + + string.push(cp); + + self.codepoints + .next() + .expect("should exist by the check above"); + } + + Some(Ok(string)) + } + } + + WideStringDecoderIterator::new(self.code_points()) + } + + /// Maps the valid segments of an UTF16 string and leaves the unpaired surrogates unchanged. 
+ #[must_use] + pub fn map_valid_segments(&self, mut f: F) -> Self + where + F: FnMut(String) -> String, + { + let mut text = Vec::new(); + + for part in self.to_std_string_with_surrogates() { + match part { + Ok(string) => text.extend(f(string).encode_utf16()), + Err(surr) => text.push(surr), + } + } + + js_string!(text) + } + /// Gets an iterator of all the Unicode codepoints of a [`JsString`]. pub fn code_points(&self) -> impl Iterator + '_ { char::decode_utf16(self.iter().copied()).map(|res| match res { diff --git a/boa_icu_provider/data/icudata.postcard b/boa_icu_provider/data/icudata.postcard index cc7773c24ce..0a8b6be8556 100644 Binary files a/boa_icu_provider/data/icudata.postcard and b/boa_icu_provider/data/icudata.postcard differ diff --git a/boa_icu_provider/src/bin/datagen.rs b/boa_icu_provider/src/bin/datagen.rs index a7cfb4ec5a1..61651fa2d87 100644 --- a/boa_icu_provider/src/bin/datagen.rs +++ b/boa_icu_provider/src/bin/datagen.rs @@ -6,7 +6,7 @@ use std::{error::Error, fs::File}; use boa_icu_provider::data_root; -use icu_datagen::{all_keys, datagen, CldrLocaleSubset, Out, SourceData}; +use icu_datagen::{all_keys_with_experimental, datagen, CldrLocaleSubset, Out, SourceData}; fn main() -> Result<(), Box> { simple_logger::SimpleLogger::new() @@ -23,5 +23,11 @@ fn main() -> Result<(), Box> { data_root().join("icudata.postcard"), )?)); - datagen(None, &all_keys(), &source_data, [blob_out].into()).map_err(Into::into) + datagen( + None, + &all_keys_with_experimental(), + &source_data, + [blob_out].into(), + ) + .map_err(Into::into) }