Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing baked data to use zerotrie #5064

Merged
merged 13 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion provider/baked/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ include.workspace = true

[dependencies]
icu_provider = { workspace = true }
writeable = { workspace = true }
zerotrie = { workspace = true, features = ["alloc"] }

crlify = { workspace = true, optional = true }
databake = { workspace = true, optional = true}
Expand All @@ -39,5 +41,6 @@ export = [
"dep:itertools",
"dep:log",
"dep:proc-macro2",
"icu_provider/export"
"icu_provider/export",
"zerotrie/databake",
]
1 change: 1 addition & 0 deletions provider/baked/src/binary_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use databake::*;
use icu_provider::prelude::*;

#[cfg(feature = "export")]
#[allow(dead_code)]
pub(crate) fn bake(
marker_bake: &TokenStream,
mut ids_to_idents: Vec<(DataIdentifierCow, proc_macro2::Ident)>,
Expand Down
4 changes: 2 additions & 2 deletions provider/baked/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ impl BakedExporter {
"* {structs_total_size}B[^1] for the singleton data struct\n "
);
} else {
let _infallible = write!(&mut doc, "* {lookup_struct_size}B[^1] for the lookup data structure ({identifiers_count} data identifiers)\n ");
let _infallible = write!(&mut doc, "* {lookup_struct_size}B for the lookup data structure ({identifiers_count} data identifiers)\n ");
let _infallible = write!(&mut doc, "* {structs_total_size}B[^1] for the actual data ({structs_count} unique structs)\n ");
};
let _infallible = write!(
Expand Down Expand Up @@ -556,7 +556,7 @@ impl DataExporter for BakedExporter {
.unwrap();

let (data, lookup_struct_size) =
crate::binary_search::bake(&marker_bake, ids_to_idents, idents_to_bakes);
crate::zerotrie::bake(&marker_bake, ids_to_idents, idents_to_bakes);

stats.lookup_struct_size = lookup_struct_size;

Expand Down
3 changes: 2 additions & 1 deletion provider/baked/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ pub mod export;
pub use icu_provider::prelude::*;

pub mod binary_search;
pub mod zerotrie;

pub trait DataStore<M: DataMarker> {
fn get(&self, req: DataIdentifierBorrowed) -> Option<&'static M::Yokeable>;

type IterReturn: Iterator<Item = DataIdentifierCow<'static>>;
fn iter(&self) -> Self::IterReturn;
fn iter(&'static self) -> Self::IterReturn;
}
92 changes: 92 additions & 0 deletions provider/baked/src/zerotrie.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Data stored as a [`ZeroTrieSimpleAscii`]

// This is a valid separator as `DataLocale` will never produce it.
const ID_SEPARATOR: u8 = 0x1E;

#[cfg(feature = "export")]
use databake::*;
use icu_provider::prelude::*;
pub use zerotrie::ZeroTrieSimpleAscii;

/// Generates the baked [`Data`] expression for a single marker.
///
/// Every data identifier becomes one trie key: the locale string, followed (only when the
/// marker attributes are non-empty) by `ID_SEPARATOR` and the attributes. The value stored
/// under a key is the position of the identifier's struct within `idents_to_bakes`, which is
/// also its position in the emitted `values` slice.
///
/// Returns the tokens `Type = initializer` for the lookup structure, together with the size
/// in bytes of that structure (the `Data` struct itself plus the trie bytes it borrows).
///
/// # Panics
///
/// Panics if an ident in `ids_to_idents` has no matching entry in `idents_to_bakes`.
#[cfg(feature = "export")]
pub(crate) fn bake(
    marker_bake: &TokenStream,
    ids_to_idents: Vec<(DataIdentifierCow, proc_macro2::Ident)>,
    idents_to_bakes: Vec<(proc_macro2::Ident, TokenStream)>,
) -> (TokenStream, usize) {
    // Position of every baked struct in the emitted `values` slice, keyed by its ident.
    let mut value_index = std::collections::HashMap::<&proc_macro2::Ident, usize>::with_capacity(
        idents_to_bakes.len(),
    );
    for (position, (ident, _)) in idents_to_bakes.iter().enumerate() {
        value_index.insert(ident, position);
    }

    // Encode each identifier as `locale [ID_SEPARATOR attributes]` and pair it with the
    // position of its struct.
    let mut entries = Vec::with_capacity(ids_to_idents.len());
    for (id, ident) in &ids_to_idents {
        let mut key = id.locale.to_string().into_bytes();
        if !id.marker_attributes.is_empty() {
            key.push(ID_SEPARATOR);
            key.extend_from_slice(id.marker_attributes.as_bytes());
        }
        entries.push((key, value_index[ident]));
    }

    let trie = ZeroTrieSimpleAscii::from_iter(entries);
    let baked_trie = trie.as_borrowed_slice().bake(&Default::default());

    // `HelloWorldV1Marker` is only a stand-in to name a concrete `Data<M>`: the struct holds
    // a trie over a byte slice and a slice reference, neither of which stores an `M` inline.
    let lookup_struct_size =
        core::mem::size_of::<Data<icu_provider::hello_world::HelloWorldV1Marker>>()
            + trie.as_borrowed_slice().borrows_size();

    // The struct initializers, in the same order that was used to assign the indices above.
    let structs = idents_to_bakes.iter().map(|(_, tokens)| tokens);

    let data = quote! {
        icu_provider_baked::zerotrie::Data<#marker_bake> = icu_provider_baked::zerotrie::Data {
            trie: icu_provider_baked:: #baked_trie,
            values: &[#(#structs,)*],
        }
    };

    (data, lookup_struct_size)
}

/// Baked data store that resolves data identifiers through a [`ZeroTrieSimpleAscii`].
///
/// A lookup key is the locale string, followed by `ID_SEPARATOR` and the marker attributes
/// when the attributes are non-empty.
pub struct Data<M: DataMarker> {
/// Trie mapping each encoded data identifier to an index into `values`.
pub trie: ZeroTrieSimpleAscii<&'static [u8]>,
/// The data structs. Each value stored in `trie` is used as an index into this slice.
pub values: &'static [M::Yokeable],
}

/// Trie-backed implementation of the baked data store.
impl<M: DataMarker> super::DataStore<M> for Data<M> {
    /// Looks up the struct stored for `id`.
    ///
    /// The trie is walked with the locale, then, only if the marker attributes are non-empty,
    /// with `ID_SEPARATOR` followed by the attributes. Returns `None` when the trie has no
    /// value at the resulting position, when writing the attributes into the cursor fails,
    /// or when the stored value is not a valid index into `values`.
    fn get(&self, id: DataIdentifierBorrowed) -> Option<&'static <M>::Yokeable> {
        use writeable::Writeable;
        let mut cursor = self.trie.cursor();
        // The result of writing the locale is deliberately discarded.
        // NOTE(review): presumably the cursor only reports an error for non-ASCII input,
        // which a locale should never contain — confirm against the zerotrie cursor docs.
        let _is_ascii = id.locale.write_to(&mut cursor);
        if !id.marker_attributes.is_empty() {
            cursor.step(ID_SEPARATOR);
            id.marker_attributes.write_to(&mut cursor).ok()?;
        }
        // Checked indexing: both fields of `Data` are public, so safe code can construct a
        // value whose trie holds indices outside `values`. An out-of-range index must yield
        // `None` rather than undefined behaviour.
        cursor.take_value().and_then(|i| self.values.get(i))
    }

    type IterReturn = core::iter::FilterMap<
        core::iter::Map<
            zerotrie::ZeroTrieIterator<'static>,
            fn((alloc::vec::Vec<u8>, usize)) -> (alloc::string::String, usize),
        >,
        fn((alloc::string::String, usize)) -> Option<DataIdentifierCow<'static>>,
    >;

    /// Iterates over every data identifier encoded in the trie.
    ///
    /// Keys that fail to parse back into a locale (or into valid marker attributes) are
    /// silently skipped.
    fn iter(&'static self) -> Self::IterReturn {
        #![allow(unused_imports)]
        use alloc::borrow::ToOwned;
        // The closure must not capture anything: it has to coerce to the `fn` pointer named
        // in `IterReturn`.
        self.trie.iter().filter_map(move |(s, _)| {
            if let Some((locale, attrs)) = s.split_once(ID_SEPARATOR as char) {
                // Key with attributes: `locale ID_SEPARATOR attributes`.
                Some(DataIdentifierCow::from_owned(
                    DataMarkerAttributes::try_from_str(attrs).ok()?.to_owned(),
                    locale.parse().ok()?,
                ))
            } else {
                // Key without a separator: the whole key is the locale.
                s.parse().ok().map(DataIdentifierCow::from_locale)
            }
        })
    }
}
2 changes: 1 addition & 1 deletion provider/baked/tests/data/fingerprints.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
core/helloworld@1, <lookup>, 1096B, 27 identifiers
core/helloworld@1, <lookup>, 176B, 27 identifiers
34 changes: 2 additions & 32 deletions provider/baked/tests/data/hello_world_v1_marker.rs.data
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
/// `icu`'s `_unstable` constructors.
///
/// Using this implementation will embed the following data in the binary's data segment:
/// * 1096B[^1] for the lookup data structure (27 data identifiers)
/// * 176B for the lookup data structure (27 data identifiers)
/// * 1100B[^1] for the actual data (27 unique structs)
///
/// [^1]: these numbers can be smaller in practice due to linker deduplication
Expand All @@ -16,37 +16,7 @@ macro_rules! __impl_hello_world_v1_marker {
const _: () = <$provider>::MUST_USE_MAKE_PROVIDER_MACRO;
#[clippy::msrv = "1.70"]
impl $provider {
const DATA_HELLO_WORLD_V1_MARKER: icu_provider_baked::binary_search::Data<icu_provider_baked::binary_search::AttributesAndLocale, icu_provider::hello_world::HelloWorldV1Marker> = {
type S = <icu_provider::hello_world::HelloWorldV1Marker as icu_provider::DynamicDataMarker>::Yokeable;
const _REVERSE_EN: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Olleh Dlrow") };
const _REVERSE_JA: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("界世はちにんこ") };
const __BN: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("ওহে বিশ\u{9cd}ব") };
const __CS: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Ahoj světe") };
const __DE: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hallo Welt") };
const __DE_AT: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Servus Welt") };
const __EL: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Καλημέρα κόσμε") };
const __EN: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello World") };
const __EN_001: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🗺\u{fe0f}") };
const __EN_002: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🌍") };
const __EN_019: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🌎") };
const __EN_142: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🌏") };
const __EN_GB: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🇬🇧") };
const __EN_GB_U_SD_GBENG: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🏴\u{e0067}\u{e0062}\u{e0065}\u{e006e}\u{e0067}\u{e007f}") };
const __EO: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Saluton, Mondo") };
const __FA: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("سلام دنیا\u{200e}") };
const __FI: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("hei maailma") };
const __IS: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Halló, heimur") };
const __JA: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("こんにちは世界") };
const __LA: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Ave, munde") };
const __PT: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Olá, mundo") };
const __RO: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Salut, lume") };
const __RU: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Привет, мир") };
const __SR: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Поздрав свете") };
const __SR_LATN: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Pozdrav svete") };
const __VI: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Xin chào thế giới") };
const __ZH: &S = &icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("你好世界") };
icu_provider_baked::binary_search::Data(&[(("", "bn"), __BN), (("", "cs"), __CS), (("", "de"), __DE), (("", "de-AT"), __DE_AT), (("", "el"), __EL), (("", "en"), __EN), (("", "en-001"), __EN_001), (("", "en-002"), __EN_002), (("", "en-019"), __EN_019), (("", "en-142"), __EN_142), (("", "en-GB"), __EN_GB), (("", "en-GB-u-sd-gbeng"), __EN_GB_U_SD_GBENG), (("", "eo"), __EO), (("", "fa"), __FA), (("", "fi"), __FI), (("", "is"), __IS), (("", "ja"), __JA), (("", "la"), __LA), (("", "pt"), __PT), (("", "ro"), __RO), (("", "ru"), __RU), (("", "sr"), __SR), (("", "sr-Latn"), __SR_LATN), (("", "vi"), __VI), (("", "zh"), __ZH), (("reverse", "en"), _REVERSE_EN), (("reverse", "ja"), _REVERSE_JA)])
};
const DATA_HELLO_WORLD_V1_MARKER: icu_provider_baked::zerotrie::Data<icu_provider::hello_world::HelloWorldV1Marker> = icu_provider_baked::zerotrie::Data { trie: icu_provider_baked::zerotrie::ZeroTrieSimpleAscii { store: b"\xCDbcdefijlprsvz\x02\x04\nBILX[^fpsn\x82s\x83e\x84-AT\x85\xC3lno\x011\x86\x87\xC2\x1E-\x08reverse\x80\xC301G\x0C\x0F\xC201\x06\xC212\x01\x88\x899\x8A42\x8BB\x8C-u-sd-gbeng\x8D\x8E\xC2ai\x01\x8F\x90\0s\x90\x01a\x90\x02\x1Ereverse\x81a\x90\x03t\x90\x04\xC2ou\x02\x90\x05\x90\x06r\x90\x07-Latn\x90\x08i\x90\th\x90\n" }, values: &[icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Olleh Dlrow") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("界世はちにんこ") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("ওহে বিশ\u{9cd}ব") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Ahoj světe") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hallo Welt") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Servus Welt") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Καλημέρα κόσμε") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello World") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🗺\u{fe0f}") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🌍") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🌎") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🌏") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🇬🇧") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Hello from 🏴\u{e0067}\u{e0062}\u{e0065}\u{e006e}\u{e0067}\u{e007f}") }, 
icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Saluton, Mondo") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("سلام دنیا\u{200e}") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("hei maailma") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Halló, heimur") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("こんにちは世界") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Ave, munde") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Olá, mundo") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Salut, lume") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Привет, мир") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Поздрав свете") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Pozdrav svete") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("Xin chào thế giới") }, icu_provider::hello_world::HelloWorldV1 { message: alloc::borrow::Cow::Borrowed("你好世界") }] };
}
#[clippy::msrv = "1.70"]
impl icu_provider::DataProvider<icu_provider::hello_world::HelloWorldV1Marker> for $provider {
Expand Down
Loading
Loading