Skip to content

Commit

Permalink
Add writeable::write_or_ref and use it in icu_locale_core (#5738)
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc authored Oct 29, 2024
1 parent 88a1a54 commit 8344161
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 35 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
- Remove `FlexZeroVec` (https://github.com/unicode-org/icu4x/pull/5604)
- `writeable`
- Make `Writeable::writeable_cmp_bytes` a free function `writeable::cmp_bytes` (https://github.com/unicode-org/icu4x/pull/5737)
- Add `writeable::to_string_or_borrow` for writing with reference bytes (https://github.com/unicode-org/icu4x/pull/5738)


## icu4x 1.5.x
Expand Down
20 changes: 2 additions & 18 deletions components/locale_core/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use crate::parser::{
};
use crate::subtags;
use alloc::borrow::Cow;
use writeable::Writeable;

/// A core struct representing a [`Unicode BCP47 Language Identifier`].
///
Expand Down Expand Up @@ -187,15 +186,7 @@ impl LanguageIdentifier {
/// ```
pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
let lang_id = Self::try_from_utf8(input)?;
let cow = lang_id.write_to_string();
if cow.as_bytes() == input {
// Safety: input is known to be valid UTF-8 since it has the same
// bytes as `cow`, which is a `str`.
let s = unsafe { core::str::from_utf8_unchecked(input) };
Ok(s.into())
} else {
Ok(cow.into_owned().into())
}
Ok(writeable::to_string_or_borrow(&lang_id, input))
}

/// Canonicalize the language identifier (operating on strings)
Expand All @@ -216,14 +207,7 @@ impl LanguageIdentifier {
/// );
/// ```
pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
let lang_id = Self::try_from_str(input)?;
let cow = lang_id.write_to_string();

if cow == input {
Ok(input.into())
} else {
Ok(cow.into_owned().into())
}
Self::canonicalize_utf8(input.as_bytes())
}

/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
Expand Down
19 changes: 2 additions & 17 deletions components/locale_core/src/locale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use crate::{extensions, subtags, LanguageIdentifier};
use alloc::borrow::Cow;
use core::cmp::Ordering;
use core::str::FromStr;
use writeable::Writeable;

/// A core struct representing a [`Unicode Locale Identifier`].
///
Expand Down Expand Up @@ -171,15 +170,7 @@ impl Locale {
/// ```
pub fn canonicalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
let locale = Self::try_from_utf8(input)?;
let cow: Cow<str> = locale.write_to_string();
if cow.as_bytes() == input {
// Safety: input is known to be valid UTF-8 since it has the same
// bytes as `cow`, which is a `str`.
let s = unsafe { core::str::from_utf8_unchecked(input) };
Ok(s.into())
} else {
Ok(cow.into_owned().into())
}
Ok(writeable::to_string_or_borrow(&locale, input))
}

/// Canonicalize the locale (operating on strings)
Expand All @@ -198,13 +189,7 @@ impl Locale {
/// );
/// ```
pub fn canonicalize(input: &str) -> Result<Cow<str>, ParseError> {
let locale = Self::try_from_str(input)?;
let cow = locale.write_to_string();
if cow == input {
Ok(input.into())
} else {
Ok(cow.into_owned().into())
}
Self::canonicalize_utf8(input.as_bytes())
}

/// Compare this [`Locale`] with BCP-47 bytes.
Expand Down
2 changes: 2 additions & 0 deletions utils/writeable/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,15 @@ mod impls;
mod ops;
mod parts_write_adapter;
mod testing;
mod to_string_or_borrow;
mod try_writeable;

use alloc::borrow::Cow;
use alloc::string::String;
use core::fmt;

pub use cmp::cmp_bytes;
pub use to_string_or_borrow::to_string_or_borrow;
pub use try_writeable::TryWriteable;

/// Helper types for trait impls.
Expand Down
210 changes: 210 additions & 0 deletions utils/writeable/src/to_string_or_borrow.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::Writeable;
use alloc::borrow::Cow;
use alloc::string::String;
use core::fmt;

/// Bytes that have been partially validated as UTF-8 up to an offset.
struct PartiallyValidatedUtf8<'a> {
// Safety Invariants:
// 1. The offset is less than or equal to the length of the slice.
// 2. The slice is valid UTF-8 up to the offset.
slice: &'a [u8],
offset: usize,
}

impl<'a> PartiallyValidatedUtf8<'a> {
fn new(slice: &'a [u8]) -> Self {
// Safety: Field invariants maintained here trivially:
// 1. The offset 0 is ≤ all possible lengths of slice
// 2. The slice contains nothing up to the offset zero
Self { slice, offset: 0 }
}

/// Check whether the given string is the next chunk of unvalidated bytes.
/// If so, increment offset and return true. Otherwise, return false.
fn try_push(&mut self, valid_str: &str) -> bool {
let new_offset = self.offset + valid_str.len();
if self.slice.get(self.offset..new_offset) == Some(valid_str.as_bytes()) {
// Safety: Field invariants maintained here:
// 1. In the line above, `self.slice.get()` returned `Some()` for `new_offset` at
// the end of a `Range`, so `new_offset` is ≤ the length of `self.slice`.
// 2. By invariant, we have already validated the string up to `self.offset`, and
// the portion of the slice between `self.offset` and `new_offset` is equal to
// `valid_str`, which is a `&str`, so the string is valid up to `new_offset`.
self.offset = new_offset;
true
} else {
false
}
}

/// Return the validated portion as `&str`.
fn validated_as_str(&self) -> &'a str {
debug_assert!(self.offset <= self.slice.len());
// Safety: self.offset is a valid end index in a range (from field invariant)
let valid_slice = unsafe { self.slice.get_unchecked(..self.offset) };
debug_assert!(core::str::from_utf8(valid_slice).is_ok());
// Safety: the UTF-8 of slice has been validated up to offset (from field invariant)
unsafe { core::str::from_utf8_unchecked(valid_slice) }
}
}

enum SliceOrString<'a> {
Slice(PartiallyValidatedUtf8<'a>),
String(String),
}

/// This is an infallible impl. Functions always return Ok, not Err.
impl fmt::Write for SliceOrString<'_> {
#[inline]
fn write_str(&mut self, other: &str) -> fmt::Result {
match self {
SliceOrString::Slice(slice) => {
if !slice.try_push(other) {
// We failed to match. Convert to owned.
let valid_str = slice.validated_as_str();
let mut owned = String::with_capacity(valid_str.len() + other.len());
owned.push_str(valid_str);
owned.push_str(other);
*self = SliceOrString::String(owned);
}
Ok(())
}
SliceOrString::String(owned) => owned.write_str(other),
}
}
}

impl<'a> SliceOrString<'a> {
#[inline]
fn new(slice: &'a [u8]) -> Self {
Self::Slice(PartiallyValidatedUtf8::new(slice))
}

#[inline]
fn finish(self) -> Cow<'a, str> {
match self {
SliceOrString::Slice(slice) => Cow::Borrowed(slice.validated_as_str()),
SliceOrString::String(owned) => Cow::Owned(owned),
}
}
}

/// Writes the contents of a `Writeable` to a string, returning a reference
/// to a slice if it matches the provided reference bytes, and allocating a
/// String otherwise.
///
/// This function is useful if you have borrowed bytes which you expect
/// to be equal to a writeable a high percentage of the time.
///
/// You can also use this function to make a more efficient implementation of
/// [`Writeable::write_to_string`].
///
/// # Examples
///
/// Basic usage and behavior:
///
/// ```
/// use std::fmt;
/// use std::borrow::Cow;
/// use writeable::Writeable;
///
/// struct WelcomeMessage<'s> {
/// pub name: &'s str,
/// }
///
/// impl<'s> Writeable for WelcomeMessage<'s> {
/// // see impl in Writeable docs
/// # fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
/// # sink.write_str("Hello, ")?;
/// # sink.write_str(self.name)?;
/// # sink.write_char('!')?;
/// # Ok(())
/// # }
/// }
///
/// let message = WelcomeMessage { name: "Alice" };
///
/// assert!(matches!(
/// writeable::to_string_or_borrow(&message, b""),
/// Cow::Owned(s) if s == "Hello, Alice!"
/// ));
/// assert!(matches!(
/// writeable::to_string_or_borrow(&message, b"Hello"),
/// Cow::Owned(s) if s == "Hello, Alice!"
/// ));
/// assert!(matches!(
/// writeable::to_string_or_borrow(&message, b"Hello, Bob!"),
/// Cow::Owned(s) if s == "Hello, Alice!"
/// ));
/// assert!(matches!(
/// writeable::to_string_or_borrow(&message, b"Hello, Alice!"),
/// Cow::Borrowed("Hello, Alice!")
/// ));
///
/// // Borrowing can use a prefix:
/// assert!(matches!(
/// writeable::to_string_or_borrow(&message, b"Hello, Alice!..\xFF\x00\xFF"),
/// Cow::Borrowed("Hello, Alice!")
/// ));
/// ```
///
/// Example use case: a function that transforms a string to lowercase.
/// We are also able to write a more efficient implementation of
/// [`Writeable::write_to_string`] in this situation.
///
/// ```
/// use std::fmt;
/// use std::borrow::Cow;
/// use writeable::Writeable;
///
/// struct MakeAsciiLower<'a>(&'a str);
///
/// impl<'a> Writeable for MakeAsciiLower<'a> {
/// fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
/// for c in self.0.chars() {
/// sink.write_char(c.to_ascii_lowercase())?;
/// }
/// Ok(())
/// }
/// #[inline]
/// fn write_to_string(&self) -> Cow<str> {
/// writeable::to_string_or_borrow(self, self.0.as_bytes())
/// }
/// }
///
/// fn make_lowercase(input: &str) -> Cow<str> {
/// let writeable = MakeAsciiLower(input);
/// writeable::to_string_or_borrow(&writeable, input.as_bytes())
/// }
///
/// assert!(matches!(
/// make_lowercase("this is lowercase"),
/// Cow::Borrowed("this is lowercase")
/// ));
/// assert!(matches!(
/// make_lowercase("this is UPPERCASE"),
/// Cow::Owned(s) if s == "this is uppercase"
/// ));
///
/// assert!(matches!(
/// MakeAsciiLower("this is lowercase").write_to_string(),
/// Cow::Borrowed("this is lowercase")
/// ));
/// assert!(matches!(
/// MakeAsciiLower("this is UPPERCASE").write_to_string(),
/// Cow::Owned(s) if s == "this is uppercase"
/// ));
/// ```
pub fn to_string_or_borrow<'a>(
writeable: &impl Writeable,
reference_bytes: &'a [u8],
) -> Cow<'a, str> {
let mut sink = SliceOrString::new(reference_bytes);
let _ = writeable.write_to(&mut sink);
sink.finish()
}

0 comments on commit 8344161

Please sign in to comment.