diff --git a/Cargo.toml b/Cargo.toml index 675cd9d..245055c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ edition = "2021" cfg-if = "1.0" hashbrown = { version = "0.11", default-features = false, features = ["ahash"] } serde = { version = "1.0", optional = true } +len-trait = {version = "0.6.1", optional = true} [dev-dependencies] serde_json = "1.0" @@ -48,8 +49,7 @@ inline-more = ["hashbrown/inline-more"] # the need for those present backends. Reduces compilation time of this crate. # # Enabled by default. -backends = [] - +backends = ["len-trait"] # Enables testing of memory heap allocations. # # These tests are disabled by default since they are slow diff --git a/benches/setup.rs b/benches/setup.rs index 0824a36..4a12cdc 100644 --- a/benches/setup.rs +++ b/benches/setup.rs @@ -6,7 +6,6 @@ use string_interner::{ SimpleBackend, StringBackend, }, - DefaultSymbol, StringInterner, }; @@ -90,7 +89,7 @@ type StringInternerWith = StringInterner; pub trait BackendBenchmark { const NAME: &'static str; - type Backend: Backend; + type Backend: Backend; fn setup() -> StringInternerWith { >::new() @@ -122,23 +121,23 @@ pub trait BackendBenchmark { pub struct BenchBucket; impl BackendBenchmark for BenchBucket { const NAME: &'static str = "BucketBackend"; - type Backend = BucketBackend; + type Backend = BucketBackend; } pub struct BenchSimple; impl BackendBenchmark for BenchSimple { const NAME: &'static str = "SimpleBackend"; - type Backend = SimpleBackend; + type Backend = SimpleBackend; } pub struct BenchString; impl BackendBenchmark for BenchString { const NAME: &'static str = "StringBackend"; - type Backend = StringBackend; + type Backend = StringBackend; } pub struct BenchBuffer; impl BackendBenchmark for BenchBuffer { const NAME: &'static str = "BufferBackend"; - type Backend = BufferBackend; + type Backend = BufferBackend; } diff --git a/src/backend/bucket/fixed.rs b/src/backend/bucket/fixed.rs new file mode 100644 index 0000000..ebd55d8 --- /dev/null +++ 
b/src/backend/bucket/fixed.rs @@ -0,0 +1,65 @@ +use std::{ + ops::Deref, + ptr::NonNull, +}; + +use len_trait::{ + Len, + WithCapacity, +}; + +/// Represents a container with a fixed initial capacity that +/// is capable of pushing elements of type `&S` into its internal buffer only if the +/// elements don't exceed its fixed capacity. +/// +/// # Safety +/// +/// It is Undefined Behaviour if any mutable or internally mutable operation +/// invalidates previously generated [`NonNull`] pointers. +/// +/// In other words, implementations must guarantee that no reallocations +/// occur after creating the container. +pub unsafe trait FixedContainer: + Deref + WithCapacity + Len +{ + /// Push the given string into the fixed string if there is enough capacity. + /// + /// Returns a reference to the pushed string if there was enough capacity to + /// perform the operation. Otherwise returns `None`. + fn try_push_str(&mut self, string: &S) -> Option>; +} + +unsafe impl FixedContainer for String { + #[inline] + fn try_push_str(&mut self, string: &str) -> Option> { + let len = self.len(); + if self.capacity() < len + string.len() { + return None + } + self.push_str(string); + debug_assert_eq!(self.len(), len + string.len()); + Some(NonNull::from( + // SAFETY: We convert from bytes to utf8 from which we know through the + // input string that they must represent valid utf8. 
+ unsafe { + core::str::from_utf8_unchecked(&self.as_bytes()[len..len + string.len()]) + }, + )) + } +} + +unsafe impl FixedContainer<[T]> for Vec +where + T: Clone, +{ + #[inline] + fn try_push_str(&mut self, string: &[T]) -> Option> { + let len = self.len(); + if self.capacity() < len + string.len() { + return None + } + self.extend_from_slice(string); + debug_assert_eq!(self.len(), len + string.len()); + Some(NonNull::from(&self[len..len + string.len()])) + } +} diff --git a/src/backend/bucket/fixed_str.rs b/src/backend/bucket/fixed_str.rs deleted file mode 100644 index cd4cc23..0000000 --- a/src/backend/bucket/fixed_str.rs +++ /dev/null @@ -1,64 +0,0 @@ -use super::InternedStr; - -#[derive(Debug, Default, Clone, PartialEq, Eq)] -pub struct FixedString { - contents: String, -} - -impl FixedString { - /// Creates a new fixed string with the given fixed capacity. - #[inline] - pub fn with_capacity(cap: usize) -> Self { - Self { - contents: String::with_capacity(cap), - } - } - - /// Returns the underlying [`Box`]. - /// - /// Guarantees not to perform any reallocations in this process. - #[inline] - pub fn finish(self) -> String { - self.contents - } - - /// Returns the capacity in bytes of the fixed string. - #[inline] - pub fn capacity(&self) -> usize { - self.contents.capacity() - } - - /// Returns the length in bytes of the fixed string. - #[inline] - pub fn len(&self) -> usize { - self.contents.len() - } - - /// Pushes the given string into the fixed string if there is enough capacity. - /// - /// Returns a reference to the pushed string if there was enough capacity to - /// perform the operation. Otherwise returns `None`. 
- #[inline] - pub fn push_str(&mut self, string: &str) -> Option { - let len = self.len(); - if self.capacity() < len + string.len() { - return None - } - self.contents.push_str(string); - debug_assert_eq!(self.contents.len(), len + string.len()); - Some(InternedStr::new( - // SAFETY: We convert from bytes to utf8 from which we know through the - // input string that they must represent valid utf8. - unsafe { - core::str::from_utf8_unchecked( - &self.contents.as_bytes()[len..len + string.len()], - ) - }, - )) - } - - /// Shrink capacity to fit the contents exactly. - pub fn shrink_to_fit(&mut self) { - self.contents.shrink_to_fit(); - } -} diff --git a/src/backend/bucket/interned_str.rs b/src/backend/bucket/interned_str.rs deleted file mode 100644 index 397f6c0..0000000 --- a/src/backend/bucket/interned_str.rs +++ /dev/null @@ -1,56 +0,0 @@ -#![cfg(feature = "backends")] - -use core::ptr::NonNull; - -/// Reference to an interned string. -/// -/// It is inherently `unsafe` to use instances of this type and should not be -/// done outside of the `string-interner` crate itself. -#[derive(Debug)] -#[repr(transparent)] -pub struct InternedStr { - ptr: NonNull, -} - -impl InternedStr { - /// Creates a new interned string from the given `str`. - #[inline] - pub fn new(val: &str) -> Self { - InternedStr { - ptr: NonNull::from(val), - } - } - - /// Returns a shared reference to the underlying string. - /// - /// # Safety - /// - /// The user has to make sure that no lifetime guarantees are invalidated. - #[inline] - pub(super) fn as_str(&self) -> &str { - // SAFETY: This is safe since we only ever operate on interned `str` - // that are never moved around in memory to avoid danling - // references. 
- unsafe { self.ptr.as_ref() } - } -} - -impl Eq for InternedStr {} - -impl PartialEq for InternedStr { - #[inline] - fn eq(&self, other: &Self) -> bool { - self.as_str() == other.as_str() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn size_of() { - use std::mem; - assert_eq!(mem::size_of::(), mem::size_of::<&str>()); - } -} diff --git a/src/backend/bucket/mod.rs b/src/backend/bucket/mod.rs index e3a2815..9153368 100644 --- a/src/backend/bucket/mod.rs +++ b/src/backend/bucket/mod.rs @@ -1,13 +1,45 @@ #![cfg(feature = "backends")] -mod fixed_str; -mod interned_str; +//! An interner backend that reduces memory allocations by using string buckets. +//! +//! # Note +//! +//! Implementation inspired by matklad's blog post that can be found here: +//! +//! +//! # Usage Hint +//! +//! Use when deallocations or copy overhead is costly or when +//! interning of static strings is especially common. +//! +//! # Usage +//! +//! - **Fill:** Efficiency of filling an empty string interner. +//! - **Resolve:** Efficiency of interned string look-up given a symbol. +//! - **Allocations:** The number of allocations performed by the backend. +//! - **Footprint:** The total heap memory consumed by the backend. +//! - **Contiguous:** True if the returned symbols have contiguous values. +//! +//! Rating varies between **bad**, **ok**, **good** and **best**. +//! +//! | Scenario | Rating | +//! |:------------|:--------:| +//! | Fill | **good** | +//! | Resolve | **ok** | +//! | Allocations | **good** | +//! | Footprint | **ok** | +//! | Supports `get_or_intern_static` | **yes** | +//! | `Send` + `Sync` | **yes** | +//! 
| Contiguous | **yes** | -use self::{ - fixed_str::FixedString, - interned_str::InternedStr, +mod fixed; + +pub use fixed::FixedContainer; + +use super::{ + Backend, + Internable, }; -use super::Backend; use crate::{ compat::Vec, symbol::expect_valid_symbol, @@ -19,44 +51,26 @@ use core::{ marker::PhantomData, slice, }; +use len_trait::{ + Capacity, + Len, + WithCapacity, +}; +use std::ptr::NonNull; /// An interner backend that reduces memory allocations by using string buckets. /// -/// # Note -/// -/// Implementation inspired by matklad's blog post that can be found here: -/// -/// -/// # Usage Hint -/// -/// Use when deallocations or copy overhead is costly or when -/// interning of static strings is especially common. -/// -/// # Usage -/// -/// - **Fill:** Efficiency of filling an empty string interner. -/// - **Resolve:** Efficiency of interned string look-up given a symbol. -/// - **Allocations:** The number of allocations performed by the backend. -/// - **Footprint:** The total heap memory consumed by the backend. -/// - **Contiguous:** True if the returned symbols have contiguous values. -/// -/// Rating varies between **bad**, **ok**, **good** and **best**. -/// -/// | Scenario | Rating | -/// |:------------|:--------:| -/// | Fill | **good** | -/// | Resolve | **ok** | -/// | Allocations | **good** | -/// | Footprint | **ok** | -/// | Supports `get_or_intern_static` | **yes** | -/// | `Send` + `Sync` | **yes** | -/// | Contiguous | **yes** | +/// See the [module-level documentation](self) for more. #[derive(Debug)] -pub struct BucketBackend { - spans: Vec, - head: FixedString, - full: Vec, - marker: PhantomData S>, +pub struct BucketBackend +where + S: ?Sized + Internable, + S::Container: FixedContainer, +{ + spans: Vec>, + head: S::Container, + full: Vec, + _marker: PhantomData Sym>, } /// # Safety @@ -64,45 +78,64 @@ pub struct BucketBackend { /// The bucket backend requires a manual [`Send`] impl because it is self /// referential. 
When cloning a bucket backend a deep clone is performed and /// all references to itself are updated for the clone. -unsafe impl Send for BucketBackend where S: Symbol {} +unsafe impl Send for BucketBackend +where + Sym: Symbol, + S: ?Sized + Internable, + S::Container: FixedContainer, +{ +} /// # Safety /// /// The bucket backend requires a manual [`Send`] impl because it is self /// referential. Those references won't escape its own scope and also /// the bucket backend has no interior mutability. -unsafe impl Sync for BucketBackend where S: Symbol {} +unsafe impl Sync for BucketBackend +where + Sym: Symbol, + S: ?Sized + Internable, + S::Container: FixedContainer, +{ +} -impl Default for BucketBackend { +impl Default for BucketBackend +where + S: ?Sized + Internable, + S::Container: FixedContainer, +{ #[cfg_attr(feature = "inline-more", inline)] fn default() -> Self { Self { spans: Vec::new(), - head: FixedString::default(), + head: S::Container::default(), full: Vec::new(), - marker: Default::default(), + _marker: Default::default(), } } } -impl Backend for BucketBackend +impl Backend for BucketBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Container: FixedContainer, { - type Symbol = S; + type Str = S; + type Symbol = Sym; #[cfg_attr(feature = "inline-more", inline)] fn with_capacity(cap: usize) -> Self { Self { spans: Vec::with_capacity(cap), - head: FixedString::with_capacity(cap), + head: S::Container::with_capacity(cap), full: Vec::new(), - marker: Default::default(), + _marker: Default::default(), } } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { + fn intern(&mut self, string: &S) -> Self::Symbol { // SAFETY: This is safe because we never hand out the returned // interned string instance to the outside and only operate // on it within this backend. 
@@ -111,103 +144,135 @@ where } #[cfg_attr(feature = "inline-more", inline)] - fn intern_static(&mut self, string: &'static str) -> Self::Symbol { - let interned = InternedStr::new(string); + fn intern_static(&mut self, string: &'static S) -> Self::Symbol { + let interned = NonNull::from(string); self.push_span(interned) } fn shrink_to_fit(&mut self) { self.spans.shrink_to_fit(); - self.head.shrink_to_fit(); self.full.shrink_to_fit(); } #[inline] - fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { - self.spans.get(symbol.to_usize()).map(InternedStr::as_str) + fn resolve(&self, symbol: Self::Symbol) -> Option<&S> { + // SAFETY: A `FixedContainer` cannot invalidate pointers to its interned + // strings, making its spans always valid. + unsafe { self.spans.get(symbol.to_usize()).map(|p| p.as_ref()) } } #[inline] - unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { + unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &S { // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. - unsafe { self.spans.get_unchecked(symbol.to_usize()).as_str() } + unsafe { self.spans.get_unchecked(symbol.to_usize()).as_ref() } } } -impl BucketBackend +impl BucketBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Container: FixedContainer, { /// Returns the next available symbol. - fn next_symbol(&self) -> S { + fn next_symbol(&self) -> Sym { expect_valid_symbol(self.spans.len()) } /// Pushes the given interned string into the spans and returns its symbol. - fn push_span(&mut self, interned: InternedStr) -> S { + fn push_span(&mut self, interned: NonNull) -> Sym { let symbol = self.next_symbol(); self.spans.push(interned); symbol } /// Interns a new string into the backend and returns a reference to it. 
- unsafe fn alloc(&mut self, string: &str) -> InternedStr { + unsafe fn alloc(&mut self, string: &S) -> NonNull { let cap = self.head.capacity(); if cap < self.head.len() + string.len() { let new_cap = (usize::max(cap, string.len()) + 1).next_power_of_two(); - let new_head = FixedString::with_capacity(new_cap); + let new_head = S::Container::with_capacity(new_cap); let old_head = core::mem::replace(&mut self.head, new_head); - self.full.push(old_head.finish()); + self.full.push(old_head); } self.head - .push_str(string) + .try_push_str(string) .expect("encountered invalid head capacity (2)") } } -impl Clone for BucketBackend { +impl Clone for BucketBackend +where + S: ?Sized + Internable, + S::Container: FixedContainer, +{ fn clone(&self) -> Self { // For performance reasons we copy all cloned strings into a single cloned // head string leaving the cloned `full` empty. let new_head_cap = self.head.capacity() + self.full.iter().fold(0, |lhs, rhs| lhs + rhs.len()); - let mut head = FixedString::with_capacity(new_head_cap); + let mut head = S::Container::with_capacity(new_head_cap); let mut spans = Vec::with_capacity(self.spans.len()); for span in &self.spans { - let string = span.as_str(); - let interned = head - .push_str(string) - .expect("encountered invalid head capacity"); - spans.push(interned); + // SAFETY: This is safe because a `FixedContainer` cannot invalidate pointers + // to its interned strings, making its references always valid. 
+ unsafe { + let string = span.as_ref(); + let interned = head + .try_push_str(string) + .expect("encountered invalid head capacity"); + spans.push(interned); + } } Self { spans, head, full: Vec::new(), - marker: Default::default(), + _marker: Default::default(), } } } -impl Eq for BucketBackend where S: Symbol {} +impl Eq for BucketBackend +where + Sym: Symbol, + S: ?Sized + Internable + Eq, + S::Container: FixedContainer, +{ +} -impl PartialEq for BucketBackend +impl PartialEq for BucketBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable + PartialEq, + S::Container: FixedContainer, { #[cfg_attr(feature = "inline-more", inline)] fn eq(&self, other: &Self) -> bool { - self.spans == other.spans + if self.spans.len() != other.spans.len() { + return false + } + + // SAFETY: A `FixedContainer` cannot invalidate pointers to its interned + // strings, making its spans always valid. + unsafe { + self.spans + .iter() + .zip(other.spans.iter()) + .all(|(x, y)| x.as_ref() == y.as_ref()) + } } } -impl<'a, S> IntoIterator for &'a BucketBackend +impl<'a, S, Sym> IntoIterator for &'a BucketBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Container: FixedContainer, { - type Item = (S, &'a str); - type IntoIter = Iter<'a, S>; + type Item = (Sym, &'a S); + type IntoIter = Iter<'a, S, Sym>; #[cfg_attr(feature = "inline-more", inline)] fn into_iter(self) -> Self::IntoIter { @@ -215,26 +280,38 @@ where } } -pub struct Iter<'a, S> { - iter: Enumerate>, - symbol_marker: PhantomData S>, +/// Iterator for a [`BucketBackend`](crate::backend::bucket::BucketBackend) +/// that returns all of its interned strings. 
+pub struct Iter<'a, S, Sym> +where + S: ?Sized + Internable, + S::Container: FixedContainer, +{ + iter: Enumerate>>, + marker: PhantomData Sym>, } -impl<'a, S> Iter<'a, S> { +impl<'a, S, Sym> Iter<'a, S, Sym> +where + S: ?Sized + Internable, + S::Container: FixedContainer, +{ #[cfg_attr(feature = "inline-more", inline)] - pub fn new(backend: &'a BucketBackend) -> Self { + pub(super) fn new(backend: &'a BucketBackend) -> Self { Self { iter: backend.spans.iter().enumerate(), - symbol_marker: Default::default(), + marker: Default::default(), } } } -impl<'a, S> Iterator for Iter<'a, S> +impl<'a, S, Sym> Iterator for Iter<'a, S, Sym> where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Container: FixedContainer, { - type Item = (S, &'a str); + type Item = (Sym, &'a S); #[inline] fn size_hint(&self) -> (usize, Option) { @@ -243,8 +320,10 @@ where #[inline] fn next(&mut self) -> Option { - self.iter - .next() - .map(|(id, interned)| (expect_valid_symbol(id), interned.as_str())) + unsafe { + self.iter + .next() + .map(|(id, interned)| (expect_valid_symbol(id), interned.as_ref())) + } } } diff --git a/src/backend/buffer.rs b/src/backend/buffer.rs index 33204fd..359982d 100644 --- a/src/backend/buffer.rs +++ b/src/backend/buffer.rs @@ -1,6 +1,36 @@ #![cfg(feature = "backends")] -use super::Backend; +//! An interner backend that appends all interned string information in a single buffer. +//! +//! # Usage Hint +//! +//! Use this backend if memory consumption is what matters most to you. +//! Note though that unlike all other backends symbol values are not contigous! +//! +//! # Usage +//! +//! - **Fill:** Efficiency of filling an empty string interner. +//! - **Resolve:** Efficiency of interned string look-up given a symbol. +//! - **Allocations:** The number of allocations performed by the backend. +//! - **Footprint:** The total heap memory consumed by the backend. +//! - **Contiguous:** True if the returned symbols have contiguous values. +//! +//! 
Rating varies between **bad**, **ok**, **good** and **best**. +//! +//! | Scenario | Rating | +//! |:------------|:--------:| +//! | Fill | **best** | +//! | Resolve | **bad** | +//! | Allocations | **best** | +//! | Footprint | **best** | +//! | Supports `get_or_intern_static` | **no** | +//! | `Send` + `Sync` | **yes** | +//! | Contiguous | **no** | + +use super::{ + Backend, + Internable, +}; use crate::{ compat::Vec, symbol::expect_valid_symbol, @@ -10,54 +40,42 @@ use crate::{ use core::{ marker::PhantomData, mem, - str, }; /// An interner backend that appends all interned string information in a single buffer. /// -/// # Usage Hint -/// -/// Use this backend if memory consumption is what matters most to you. -/// Note though that unlike all other backends symbol values are not contigous! -/// -/// # Usage -/// -/// - **Fill:** Efficiency of filling an empty string interner. -/// - **Resolve:** Efficiency of interned string look-up given a symbol. -/// - **Allocations:** The number of allocations performed by the backend. -/// - **Footprint:** The total heap memory consumed by the backend. -/// - **Contiguous:** True if the returned symbols have contiguous values. -/// -/// Rating varies between **bad**, **ok**, **good** and **best**. -/// -/// | Scenario | Rating | -/// |:------------|:--------:| -/// | Fill | **best** | -/// | Resolve | **bad** | -/// | Allocations | **best** | -/// | Footprint | **best** | -/// | Supports `get_or_intern_static` | **no** | -/// | `Send` + `Sync` | **yes** | -/// | Contiguous | **no** | +/// See the [module-level documentation](self) for more. 
#[derive(Debug)] -pub struct BufferBackend { +pub struct BufferBackend +where + S: ?Sized + Internable, +{ len_strings: usize, buffer: Vec, - marker: PhantomData S>, + marker: PhantomData Sym>, } -impl PartialEq for BufferBackend +impl PartialEq for BufferBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { fn eq(&self, other: &Self) -> bool { self.len_strings.eq(&other.len_strings) && self.buffer.eq(&other.buffer) } } -impl Eq for BufferBackend where S: Symbol {} +impl Eq for BufferBackend +where + Sym: Symbol, + S: ?Sized + Internable, +{ +} -impl Clone for BufferBackend { +impl Clone for BufferBackend +where + S: ?Sized + Internable, +{ fn clone(&self) -> Self { Self { len_strings: self.len_strings, @@ -67,7 +85,10 @@ impl Clone for BufferBackend { } } -impl Default for BufferBackend { +impl Default for BufferBackend +where + S: ?Sized + Internable, +{ #[cfg_attr(feature = "inline-more", inline)] fn default() -> Self { Self { @@ -78,13 +99,14 @@ impl Default for BufferBackend { } } -impl BufferBackend +impl BufferBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { /// Returns the next available symbol. #[inline] - fn next_symbol(&self) -> S { + fn next_symbol(&self) -> Sym { expect_valid_symbol(self.buffer.len()) } @@ -94,15 +116,12 @@ where /// /// Returns the string from the given index if any as well /// as the index of the next string in the buffer. 
- fn resolve_index_to_str(&self, index: usize) -> Option<(&str, usize)> { - let bytes = self.buffer.get(index..)?; - let (str_len, str_len_bytes) = decode_var_usize(bytes)?; - let index_str = index + str_len_bytes; + fn resolve_index_to_str(&self, index: usize) -> Option<(&S, usize)> { + let buffer = self.buffer.get(index..)?; + let (str_len, decoded_bytes) = decode_var_usize(buffer)?; + let index_str = index + decoded_bytes; let str_bytes = self.buffer.get(index_str..index_str + str_len)?; - // SAFETY: It is guaranteed by the backend that only valid strings - // are stored in this portion of the buffer. - let string = unsafe { str::from_utf8_unchecked(str_bytes) }; - Some((string, index_str + str_len)) + Some((S::from_bytes(str_bytes), index_str + str_len)) } /// Resolves the string for the given symbol. @@ -115,21 +134,23 @@ where /// /// The caller of the function has to ensure that calling this method /// is safe to do. - unsafe fn resolve_index_to_str_unchecked(&self, index: usize) -> &str { + unsafe fn resolve_index_to_str_unchecked(&self, index: usize) -> &S { // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. - let slice_len = unsafe { self.buffer.get_unchecked(index..) }; + let buffer = unsafe { self.buffer.get_unchecked(index..) }; + + // SAFETY: The function is marked unsafe so that the caller guarantees + // that required invariants are checked. + let (str_len, decoded_bytes) = unsafe { decode_var_usize_unchecked(buffer) }; + + let start_str = index + decoded_bytes; + // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. - let (str_len, str_len_bytes) = unsafe { decode_var_usize_unchecked(slice_len) }; - let start_str = index + str_len_bytes; let str_bytes = - // SAFETY: The function is marked unsafe so that the caller guarantees - // that required invariants are checked. 
unsafe { self.buffer.get_unchecked(start_str..start_str + str_len) }; - // SAFETY: It is guaranteed by the backend that only valid strings - // are stored in this portion of the buffer. - unsafe { str::from_utf8_unchecked(str_bytes) } + + S::from_bytes(str_bytes) } /// Pushes the given value onto the buffer with `var7` encoding. @@ -145,22 +166,24 @@ where /// # Panics /// /// If the backend ran out of symbols. - fn push_string(&mut self, string: &str) -> S { + fn push_string(&mut self, string: &S) -> Sym { let symbol = self.next_symbol(); - let str_len = string.len(); - let str_bytes = string.as_bytes(); - self.encode_var_usize(str_len); - self.buffer.extend(str_bytes); + let string = string.to_bytes(); + + self.encode_var_usize(string.len()); + self.buffer.extend_from_slice(string); self.len_strings += 1; symbol } } -impl Backend for BufferBackend +impl Backend for BufferBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Symbol = S; + type Str = S; + type Symbol = Sym; #[cfg_attr(feature = "inline-more", inline)] fn with_capacity(capacity: usize) -> Self { @@ -168,7 +191,7 @@ where const LEN_USIZE: usize = mem::size_of::(); /// According to google the approx. word length is 5. 
const DEFAULT_STR_LEN: usize = 5; - let bytes_per_string = DEFAULT_STR_LEN * LEN_USIZE; + let bytes_per_string = DEFAULT_STR_LEN * LEN_USIZE * mem::size_of::(); Self { len_strings: 0, buffer: Vec::with_capacity(capacity * bytes_per_string), @@ -177,12 +200,12 @@ where } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { + fn intern(&mut self, string: &S) -> Self::Symbol { self.push_string(string) } #[inline] - fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { + fn resolve(&self, symbol: Self::Symbol) -> Option<&S> { self.resolve_index_to_str(symbol.to_usize()) .map(|(string, _next_str_index)| string) } @@ -192,7 +215,7 @@ where } #[inline] - unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { + unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &S { // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. unsafe { self.resolve_index_to_str_unchecked(symbol.to_usize()) } @@ -404,12 +427,14 @@ mod tests { } } -impl<'a, S> IntoIterator for &'a BufferBackend +impl<'a, S, Sym> IntoIterator for &'a BufferBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Element: Copy, { - type Item = (S, &'a str); - type IntoIter = Iter<'a, S>; + type Item = (Sym, &'a S); + type IntoIter = Iter<'a, S, Sym>; #[cfg_attr(feature = "inline-more", inline)] fn into_iter(self) -> Self::IntoIter { @@ -417,15 +442,23 @@ where } } -pub struct Iter<'a, S> { - backend: &'a BufferBackend, +/// Iterator for a [`BufferBackend`](crate::backend::buffer::BufferBackend) +/// that returns all of its interned strings. 
+pub struct Iter<'a, S, Sym> +where + S: ?Sized + Internable, +{ + backend: &'a BufferBackend, yielded: usize, current: usize, } -impl<'a, S> Iter<'a, S> { +impl<'a, S, Sym> Iter<'a, S, Sym> +where + S: ?Sized + Internable, +{ #[cfg_attr(feature = "inline-more", inline)] - pub fn new(backend: &'a BufferBackend) -> Self { + pub(super) fn new(backend: &'a BufferBackend) -> Self { Self { backend, yielded: 0, @@ -434,11 +467,13 @@ impl<'a, S> Iter<'a, S> { } } -impl<'a, S> Iterator for Iter<'a, S> +impl<'a, S, Sym> Iterator for Iter<'a, S, Sym> where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Element: Copy, { - type Item = (S, &'a str); + type Item = (Sym, &'a S); #[inline] fn size_hint(&self) -> (usize, Option) { @@ -450,7 +485,7 @@ where fn next(&mut self) -> Option { self.backend.resolve_index_to_str(self.current).and_then( |(string, next_string_index)| { - let symbol = S::try_from_usize(self.current)?; + let symbol = Sym::try_from_usize(self.current)?; self.current = next_string_index; self.yielded += 1; Some((symbol, string)) @@ -459,9 +494,11 @@ where } } -impl<'a, S> ExactSizeIterator for Iter<'a, S> +impl<'a, S, Sym> ExactSizeIterator for Iter<'a, S, Sym> where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, + S::Element: Copy, { fn len(&self) -> usize { self.backend.len_strings - self.yielded diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 2920e50..f2a823b 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -4,10 +4,18 @@ //! There are trade-offs for the different kinds of backends. A user should //! find the backend that suits their use case best. -mod bucket; -mod buffer; -mod simple; -mod string; +pub mod bucket; +pub mod buffer; +pub mod simple; +pub mod string; + +use std::ops::Deref; + +use len_trait::{ + CapacityMut, + Len, + WithCapacity, +}; #[cfg(feature = "backends")] pub use self::{ @@ -20,15 +28,15 @@ use crate::Symbol; #[cfg(not(feature = "backends"))] /// Indicates that no proper backend is in use. 
-pub struct NoBackend(core::marker::PhantomData); +pub struct NoBackend; cfg_if::cfg_if! { if #[cfg(feature = "backends")] { /// The default backend recommended for general use. - pub type DefaultBackend = StringBackend; + pub type DefaultBackend = StringBackend; } else { /// The `backends` crate feature is disabled thus there is no default backend. - pub type DefaultBackend = NoBackend; + pub type DefaultBackend = NoBackend; } } @@ -38,6 +46,8 @@ cfg_if::cfg_if! { /// strings. Different backends have different trade-offs. Users should pick /// their backend with hinsight of their personal use-case. pub trait Backend: Default { + /// The type of the interned strings + type Str: Internable + ?Sized; /// The symbol used by the string interner backend. type Symbol: Symbol; @@ -52,7 +62,7 @@ pub trait Backend: Default { /// /// The backend must make sure that the returned symbol maps back to the /// original string in its [`resolve`](`Backend::resolve`) method. - fn intern(&mut self, string: &str) -> Self::Symbol; + fn intern(&mut self, string: &Self::Str) -> Self::Symbol; /// Interns the given static string and returns its interned ref and symbol. /// @@ -61,7 +71,7 @@ pub trait Backend: Default { /// The backend must make sure that the returned symbol maps back to the /// original string in its [`resolve`](`Backend::resolve`) method. #[inline] - fn intern_static(&mut self, string: &'static str) -> Self::Symbol { + fn intern_static(&mut self, string: &'static Self::Str) -> Self::Symbol { // The default implementation simply forwards to the normal [`intern`] // implementation. Backends that can optimize for this use case should // implement this method. @@ -72,7 +82,7 @@ pub trait Backend: Default { fn shrink_to_fit(&mut self); /// Resolves the given symbol to its original string contents. 
- fn resolve(&self, symbol: Self::Symbol) -> Option<&str>; + fn resolve(&self, symbol: Self::Symbol) -> Option<&Self::Str>; /// Resolves the given symbol to its original string contents. /// @@ -83,5 +93,118 @@ pub trait Backend: Default { /// by the [`intern`](`Backend::intern`) or /// [`intern_static`](`Backend::intern_static`) methods of the same /// interner backend. - unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str; + unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &Self::Str; +} + +/// Represents a type that is internable within all backends. +/// This type must be supported by a slice `&[Self::Element]`. +/// +/// This trait is the only bound needed for a type to be compatible +/// with almost all backends implemented in this crate, with the exception +/// of [`BucketBackend`](crate::backend::BucketBackend), which also requires a +/// [`FixedContainer`](crate::backend::bucket::FixedContainer) implementation +/// for `::Owned` +pub trait Internable: Len { + /// The container type used as a buffer for storing `Self`. + type Container: Into> + Deref + CapacityMut; + /// The element type of the slice view. + type Element: Copy; + /// Convert from a slice `[Self::Element]` to `Self`. + fn from_slice(input: &[Self::Element]) -> &Self; + /// Convert `Self` to a slice `[Self::Element]`. + fn to_slice(&self) -> &[Self::Element]; + /// Push the contents of Self into a `Self::Container`. + fn push_str(buffer: &mut Self::Container, str: &Self); + /// Create a new `Box` from the data of `Self`. + #[inline(always)] + fn to_boxed(&self) -> Box { + let mut c = Self::Container::with_capacity(self.len()); + Self::push_str(&mut c, self); + c.into() + } + /// Convert from a slice of bits `[u8]` to `Self`. + /// + /// For performance reasons, + /// the default implementation should be overriden for types where it's + /// trivial to transform between `Self` and `[u8]`. 
+ #[inline] + fn from_bytes(buffer: &[u8]) -> &Self { + let count = buffer.len() / core::mem::size_of::(); + // SAFETY: It is guaranteed by the backend that only valid strings + // are stored. + unsafe { + Self::from_slice(core::slice::from_raw_parts( + buffer.as_ptr().cast::(), + count, + )) + } + } + /// Convert from `Self` to a slice of bytes `[u8]`. + /// + /// For performance reasons, + /// the default implementation should be overridden for types where it's + /// trivial to transform between `Self` and `[u8]`. + #[inline] + fn to_bytes(&self) -> &[u8] { + let elems = self.to_slice(); + // SAFETY: A conversion from an own slice to its byte representation + // must always be valid. + unsafe { + core::slice::from_raw_parts( + elems.as_ptr().cast::(), + elems.len() * core::mem::size_of::(), + ) + } + } +} + +impl Internable for str { + type Container = String; + type Element = u8; + #[inline(always)] + fn from_slice(input: &[Self::Element]) -> &Self { + // SAFETY: Internally the backends only manipulate `&[u8]` slices + // which are valid utf-8.
+ unsafe { std::str::from_utf8_unchecked(input) } + } + #[inline(always)] + fn to_slice(&self) -> &[Self::Element] { + self.as_bytes() + } + #[inline(always)] + fn push_str(buffer: &mut Self::Container, str: &Self) { + buffer.push_str(str) + } + #[inline(always)] + fn to_boxed(&self) -> Box { + self.to_owned().into_boxed_str() + } + #[inline(always)] + fn from_bytes(buffer: &[u8]) -> &Self { + Self::from_slice(buffer) + } + #[inline(always)] + fn to_bytes(&self) -> &[u8] { + self.to_slice() + } +} + +impl Internable for [T] +where + T: Copy, +{ + type Container = Vec; + type Element = T; + #[inline(always)] + fn from_slice(input: &[Self::Element]) -> &Self { + input + } + #[inline(always)] + fn to_slice(&self) -> &[Self::Element] { + self + } + #[inline(always)] + fn push_str(buffer: &mut Self::Container, str: &Self) { + buffer.extend_from_slice(str) + } } diff --git a/src/backend/simple.rs b/src/backend/simple.rs index 564e92e..3f444a4 100644 --- a/src/backend/simple.rs +++ b/src/backend/simple.rs @@ -1,10 +1,42 @@ #![cfg(feature = "backends")] -use super::Backend; +//! A simple backend that stores a separate allocation for every interned string. +//! +//! Use this if you can afford many small allocations and if you want to have +//! especially decent performance for look-ups when the string interner is +//! already filled to some extent. +//! +//! # Usage Hint +//! +//! Never actually use this interner backend since it only acts as a trivial baseline. +//! +//! # Usage +//! +//! - **Fill:** Efficiency of filling an empty string interner. +//! - **Resolve:** Efficiency of interned string look-up given a symbol. +//! - **Allocations:** The number of allocations performed by the backend. +//! - **Footprint:** The total heap memory consumed by the backend. +//! - **Contiguous:** True if the returned symbols have contiguous values. +//! +//! Rating varies between **bad**, **ok**, **good** and **best**. +//! +//! | Scenario | Rating | +//!
|:------------|:--------:| +//! | Fill | **bad** | +//! | Resolve | **good** | +//! | Allocations | **bad** | +//! | Footprint | **bad** | +//! | Supports `get_or_intern_static` | **no** | +//! | `Send` + `Sync` | **yes** | +//! | Contiguous | **yes** | + +use super::{ + Backend, + Internable, +}; use crate::{ compat::{ Box, - ToString, Vec, }, symbol::expect_valid_symbol, @@ -19,40 +51,20 @@ use core::{ /// A simple backend that stores a separate allocation for every interned string. /// -/// Use this if you can afford many small allocations and if you want to have -/// especially decent performance for look-ups when the string interner is -/// already filled to some extend. -/// -/// # Usage Hint -/// -/// Never actually use this interner backend since it only acts as a trivial baseline. -/// -/// # Usage -/// -/// - **Fill:** Efficiency of filling an empty string interner. -/// - **Resolve:** Efficiency of interned string look-up given a symbol. -/// - **Allocations:** The number of allocations performed by the backend. -/// - **Footprint:** The total heap memory consumed by the backend. -/// - **Contiguous:** True if the returned symbols have contiguous values. -/// -/// Rating varies between **bad**, **ok**, **good** and **best**. -/// -/// | Scenario | Rating | -/// |:------------|:--------:| -/// | Fill | **bad** | -/// | Resolve | **good** | -/// | Allocations | **bad** | -/// | Footprint | **bad** | -/// | Supports `get_or_intern_static` | **no** | -/// | `Send` + `Sync` | **yes** | -/// | Contiguous | **yes** | +/// See the [module-level documentation](self) for more. 
#[derive(Debug)] -pub struct SimpleBackend { - strings: Vec>, - symbol_marker: PhantomData S>, +pub struct SimpleBackend +where + S: ?Sized + Internable, +{ + strings: Vec>, + symbol_marker: PhantomData Sym>, } -impl Default for SimpleBackend { +impl Default for SimpleBackend +where + S: ?Sized + Internable, +{ #[cfg_attr(feature = "inline-more", inline)] fn default() -> Self { Self { @@ -62,11 +74,13 @@ impl Default for SimpleBackend { } } -impl Backend for SimpleBackend +impl Backend for SimpleBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Symbol = S; + type Str = S; + type Symbol = Sym; #[cfg_attr(feature = "inline-more", inline)] fn with_capacity(cap: usize) -> Self { @@ -77,10 +91,10 @@ where } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { + fn intern(&mut self, string: &S) -> Self::Symbol +where { let symbol = expect_valid_symbol(self.strings.len()); - let str = string.to_string().into_boxed_str(); - self.strings.push(str); + self.strings.push(string.to_boxed()); symbol } @@ -89,33 +103,42 @@ where } #[inline] - fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { + fn resolve(&self, symbol: Self::Symbol) -> Option<&S> { self.strings.get(symbol.to_usize()).map(|pinned| &**pinned) } #[inline] - unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { + unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &S { // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. 
unsafe { self.strings.get_unchecked(symbol.to_usize()) } } } -impl Clone for SimpleBackend { +impl Clone for SimpleBackend +where + S: ?Sized + Internable, +{ #[cfg_attr(feature = "inline-more", inline)] fn clone(&self) -> Self { Self { - strings: self.strings.clone(), + strings: self.strings.iter().map(|s| s.as_ref().to_boxed()).collect(), symbol_marker: Default::default(), } } } -impl Eq for SimpleBackend where S: Symbol {} +impl Eq for SimpleBackend +where + Sym: Symbol, + S: ?Sized + Internable + Eq, +{ +} -impl PartialEq for SimpleBackend +impl PartialEq for SimpleBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable + PartialEq, { #[cfg_attr(feature = "inline-more", inline)] fn eq(&self, other: &Self) -> bool { @@ -123,12 +146,13 @@ where } } -impl<'a, S> IntoIterator for &'a SimpleBackend +impl<'a, S, Sym> IntoIterator for &'a SimpleBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Item = (S, &'a str); - type IntoIter = Iter<'a, S>; + type Item = (Sym, &'a S); + type IntoIter = Iter<'a, S, Sym>; #[cfg_attr(feature = "inline-more", inline)] fn into_iter(self) -> Self::IntoIter { @@ -136,14 +160,22 @@ where } } -pub struct Iter<'a, S> { - iter: Enumerate>>, - symbol_marker: PhantomData S>, +/// Iterator for a [`SimpleBackend`](crate::backend::simple::SimpleBackend) +/// that returns all of its interned strings. 
+pub struct Iter<'a, S, Sym> +where + S: ?Sized + Internable, +{ + iter: Enumerate>>, + symbol_marker: PhantomData Sym>, } -impl<'a, S> Iter<'a, S> { +impl<'a, S, Sym> Iter<'a, S, Sym> +where + S: ?Sized + Internable, +{ #[cfg_attr(feature = "inline-more", inline)] - pub fn new(backend: &'a SimpleBackend) -> Self { + pub(super) fn new(backend: &'a SimpleBackend) -> Self { Self { iter: backend.strings.iter().enumerate(), symbol_marker: Default::default(), @@ -151,11 +183,12 @@ impl<'a, S> Iter<'a, S> { } } -impl<'a, S> Iterator for Iter<'a, S> +impl<'a, S, Sym> Iterator for Iter<'a, S, Sym> where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Item = (S, &'a str); + type Item = (Sym, &'a S); #[inline] fn size_hint(&self) -> (usize, Option) { diff --git a/src/backend/string.rs b/src/backend/string.rs index 320de24..1a3d5ae 100644 --- a/src/backend/string.rs +++ b/src/backend/string.rs @@ -1,11 +1,47 @@ #![cfg(feature = "backends")] -use super::Backend; +//! An interner backend that accumulates all interned string contents into one string. +//! +//! # Note +//! +//! Implementation inspired by [CAD97's](https://github.com/CAD97) research +//! project [`strena`](https://github.com/CAD97/strena). +//! +//! # Usage Hint +//! +//! Use this backend if runtime performance is what matters most to you. +//! +//! # Usage +//! +//! - **Fill:** Efficiency of filling an empty string interner. +//! - **Resolve:** Efficiency of interned string look-up given a symbol. +//! - **Allocations:** The number of allocations performed by the backend. +//! - **Footprint:** The total heap memory consumed by the backend. +//! - **Contiguous:** True if the returned symbols have contiguous values. +//! +//! Rating varies between **bad**, **ok**, **good** and **best**. +//! +//! | Scenario | Rating | +//! |:------------|:--------:| +//! | Fill | **good** | +//! | Resolve | **ok** | +//! | Allocations | **good** | +//! | Footprint | **good** | +//! 
| Supports `get_or_intern_static` | **no** | +//! | `Send` + `Sync` | **yes** | +//! | Contiguous | **yes** | + +use len_trait::{ + CapacityMut, + WithCapacity, +}; + +use super::{ + Backend, + Internable, +}; use crate::{ - compat::{ - String, - Vec, - }, + compat::Vec, symbol::expect_valid_symbol, DefaultSymbol, Symbol, @@ -18,51 +54,28 @@ use core::{ /// An interner backend that accumulates all interned string contents into one string. /// -/// # Note -/// -/// Implementation inspired by [CAD97's](https://github.com/CAD97) research -/// project [`strena`](https://github.com/CAD97/strena). -/// -/// # Usage Hint -/// -/// Use this backend if runtime performance is what matters most to you. -/// -/// # Usage -/// -/// - **Fill:** Efficiency of filling an empty string interner. -/// - **Resolve:** Efficiency of interned string look-up given a symbol. -/// - **Allocations:** The number of allocations performed by the backend. -/// - **Footprint:** The total heap memory consumed by the backend. -/// - **Contiguous:** True if the returned symbols have contiguous values. -/// -/// Rating varies between **bad**, **ok**, **good** and **best**. -/// -/// | Scenario | Rating | -/// |:------------|:--------:| -/// | Fill | **good** | -/// | Resolve | **ok** | -/// | Allocations | **good** | -/// | Footprint | **good** | -/// | Supports `get_or_intern_static` | **no** | -/// | `Send` + `Sync` | **yes** | -/// | Contiguous | **yes** | +/// See the [module-level documentation](self) for more. #[derive(Debug)] -pub struct StringBackend { +pub struct StringBackend +where + S: ?Sized + Internable, +{ ends: Vec, - buffer: String, - marker: PhantomData S>, + buffer: S::Container, + marker: PhantomData Sym>, } /// Represents a `[from, to)` index into the `StringBackend` buffer. 
#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct Span { +struct Span { from: usize, to: usize, } -impl PartialEq for StringBackend +impl PartialEq for StringBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable + PartialEq, { fn eq(&self, other: &Self) -> bool { if self.ends.len() != other.ends.len() { @@ -77,9 +90,18 @@ where } } -impl Eq for StringBackend where S: Symbol {} +impl Eq for StringBackend +where + Sym: Symbol, + S: ?Sized + Internable + Eq, +{ +} -impl Clone for StringBackend { +impl Clone for StringBackend +where + S: ?Sized + Internable, + S::Container: Clone, +{ fn clone(&self) -> Self { Self { ends: self.ends.clone(), @@ -89,43 +111,40 @@ impl Clone for StringBackend { } } -impl Default for StringBackend { +impl Default for StringBackend +where + S: ?Sized + Internable, + S::Container: Default, +{ #[cfg_attr(feature = "inline-more", inline)] fn default() -> Self { Self { ends: Vec::default(), - buffer: String::default(), + buffer: S::Container::default(), marker: Default::default(), } } } -impl StringBackend +impl StringBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { /// Returns the next available symbol. - fn next_symbol(&self) -> S { + fn next_symbol(&self) -> Sym { expect_valid_symbol(self.ends.len()) } /// Returns the string associated to the span. - fn span_to_str(&self, span: Span) -> &str { - // SAFETY: - We convert a `String` into its underlying bytes and then - // directly reinterpret it as `&str` again which is safe. - // - Nothing mutates the string in between since this is a `&self` - // method. - // - The spans we use for `(start..end]` ranges are always - // constructed in accordance to valid utf8 byte ranges. 
- unsafe { - core::str::from_utf8_unchecked( - &self.buffer.as_bytes()[(span.from as usize)..(span.to as usize)], - ) - } + fn span_to_str(&self, span: Span) -> &S { + S::from_slice( + &(*self.buffer).to_slice()[(span.from as usize)..(span.to as usize)], + ) } /// Returns the span for the given symbol if any. - fn symbol_to_span(&self, symbol: S) -> Option { + fn symbol_to_span(&self, symbol: Sym) -> Option { let index = symbol.to_usize(); self.ends.get(index).copied().map(|to| { let from = self.ends.get(index.wrapping_sub(1)).copied().unwrap_or(0); @@ -134,7 +153,7 @@ where } /// Returns the span for the given symbol if any. - unsafe fn symbol_to_span_unchecked(&self, symbol: S) -> Span { + unsafe fn symbol_to_span_unchecked(&self, symbol: Sym) -> Span { let index = symbol.to_usize(); // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. @@ -148,20 +167,22 @@ where /// # Panics /// /// If the backend ran out of symbols. - fn push_string(&mut self, string: &str) -> S { - self.buffer.push_str(string); - let to = self.buffer.as_bytes().len(); + fn push_string(&mut self, string: &S) -> Sym { + S::push_str(&mut self.buffer, string); + let to = (*self.buffer).to_slice().len(); let symbol = self.next_symbol(); self.ends.push(to); symbol } } -impl Backend for StringBackend +impl Backend for StringBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Symbol = S; + type Str = S; + type Symbol = Sym; #[cfg_attr(feature = "inline-more", inline)] fn with_capacity(cap: usize) -> Self { @@ -169,18 +190,18 @@ where let default_word_len = 5; Self { ends: Vec::with_capacity(cap), - buffer: String::with_capacity(cap * default_word_len), + buffer: ::with_capacity(cap * default_word_len), marker: Default::default(), } } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { + fn intern(&mut self, string: &S) -> Self::Symbol { self.push_string(string) } #[inline] - fn resolve(&self, symbol: 
Self::Symbol) -> Option<&str> { + fn resolve(&self, symbol: Self::Symbol) -> Option<&S> { self.symbol_to_span(symbol) .map(|span| self.span_to_str(span)) } @@ -191,19 +212,20 @@ where } #[inline] - unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { + unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &S { // SAFETY: The function is marked unsafe so that the caller guarantees // that required invariants are checked. unsafe { self.span_to_str(self.symbol_to_span_unchecked(symbol)) } } } -impl<'a, S> IntoIterator for &'a StringBackend +impl<'a, S, Sym> IntoIterator for &'a StringBackend where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Item = (S, &'a str); - type IntoIter = Iter<'a, S>; + type Item = (Sym, &'a S); + type IntoIter = Iter<'a, S, Sym>; #[cfg_attr(feature = "inline-more", inline)] fn into_iter(self) -> Self::IntoIter { @@ -211,15 +233,23 @@ where } } -pub struct Iter<'a, S> { - backend: &'a StringBackend, +/// Iterator for a [`StringBackend`](crate::backend::string::StringBackend) +/// that returns all of its interned strings. 
+pub struct Iter<'a, S, Sym> +where + S: ?Sized + Internable, +{ + backend: &'a StringBackend, start: usize, ends: Enumerate>, } -impl<'a, S> Iter<'a, S> { +impl<'a, S, Sym> Iter<'a, S, Sym> +where + S: ?Sized + Internable, +{ #[cfg_attr(feature = "inline-more", inline)] - pub fn new(backend: &'a StringBackend) -> Self { + pub(super) fn new(backend: &'a StringBackend) -> Self { Self { backend, start: 0, @@ -228,11 +258,12 @@ impl<'a, S> Iter<'a, S> { } } -impl<'a, S> Iterator for Iter<'a, S> +impl<'a, S, Sym> Iterator for Iter<'a, S, Sym> where - S: Symbol, + Sym: Symbol, + S: ?Sized + Internable, { - type Item = (S, &'a str); + type Item = (Sym, &'a S); #[inline] fn size_hint(&self) -> (usize, Option) { diff --git a/src/interner.rs b/src/interner.rs index f33a8e9..cffe4bd 100644 --- a/src/interner.rs +++ b/src/interner.rs @@ -5,7 +5,6 @@ use crate::{ HashMap, }, DefaultBackend, - DefaultSymbol, Symbol, }; use core::{ @@ -43,12 +42,12 @@ where /// - This maps from `string` type to `symbol` type. /// - [`StringInterner::resolve`]: To resolve your already interned strings. /// - This maps from `symbol` type to `string` type. 
-pub struct StringInterner, H = DefaultHashBuilder> +pub struct StringInterner where B: Backend, H: BuildHasher, { - dedup: HashMap<::Symbol, (), ()>, + dedup: HashMap, hasher: H, backend: B, } @@ -56,7 +55,7 @@ where impl Debug for StringInterner where B: Backend + Debug, - ::Symbol: Symbol + Debug, + B::Symbol: Symbol + Debug, H: BuildHasher, { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { @@ -68,7 +67,12 @@ where } #[cfg(feature = "backends")] -impl Default for StringInterner { +impl Default for StringInterner +where + B: Backend, + B::Symbol: Symbol, + H: BuildHasher + Default, +{ #[cfg_attr(feature = "inline-more", inline)] fn default() -> Self { StringInterner::new() @@ -78,7 +82,7 @@ impl Default for StringInterner { impl Clone for StringInterner where B: Backend + Clone, - ::Symbol: Symbol, + B::Symbol: Symbol, H: BuildHasher + Clone, { fn clone(&self) -> Self { @@ -93,7 +97,7 @@ where impl PartialEq for StringInterner where B: Backend + PartialEq, - ::Symbol: Symbol, + B::Symbol: Symbol, H: BuildHasher, { fn eq(&self, rhs: &Self) -> bool { @@ -104,7 +108,8 @@ where impl Eq for StringInterner where B: Backend + Eq, - ::Symbol: Symbol, + B::Str: Hash, + B::Symbol: Symbol, H: BuildHasher, { } @@ -112,7 +117,26 @@ where impl StringInterner where B: Backend, - ::Symbol: Symbol, + B::Symbol: Symbol, + H: BuildHasher, +{ + /// Returns the number of strings interned by the interner. + #[cfg_attr(feature = "inline-more", inline)] + pub fn len(&self) -> usize { + self.dedup.len() + } + + /// Returns `true` if the string interner has no interned strings. + #[cfg_attr(feature = "inline-more", inline)] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl StringInterner +where + B: Backend, + B::Symbol: Symbol, H: BuildHasher + Default, { /// Creates a new empty `StringInterner`. 
@@ -139,7 +163,8 @@ where impl StringInterner where B: Backend, - ::Symbol: Symbol, + B::Str: Hash + PartialEq, + B::Symbol: Symbol, H: BuildHasher, { /// Creates a new empty `StringInterner` with the given hasher. @@ -162,25 +187,13 @@ where } } - /// Returns the number of strings interned by the interner. - #[cfg_attr(feature = "inline-more", inline)] - pub fn len(&self) -> usize { - self.dedup.len() - } - - /// Returns `true` if the string interner has no interned strings. - #[cfg_attr(feature = "inline-more", inline)] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - /// Returns the symbol for the given string if any. /// /// Can be used to query if a string has already been interned without interning. #[inline] - pub fn get(&self, string: T) -> Option<::Symbol> + pub fn get(&self, string: T) -> Option where - T: AsRef, + T: AsRef, { let string = string.as_ref(); let Self { @@ -206,20 +219,17 @@ where /// [1]: [`StringInterner::get_or_intern`] /// [2]: [`StringInterner::get_or_intern_static`] #[cfg_attr(feature = "inline-more", inline)] - fn get_or_intern_using( + fn get_or_intern_using<'a>( &mut self, - string: T, - intern_fn: fn(&mut B, T) -> ::Symbol, - ) -> ::Symbol - where - T: Copy + Hash + AsRef + for<'a> PartialEq<&'a str>, - { + string: &'a B::Str, + intern_fn: fn(&mut B, &'a B::Str) -> B::Symbol, + ) -> B::Symbol { let Self { dedup, hasher, backend, } = self; - let hash = make_hash(hasher, string.as_ref()); + let hash = make_hash(hasher, string); let entry = dedup.raw_entry_mut().from_hash(hash, |symbol| { // SAFETY: This is safe because we only operate on symbols that // we receive from our backend making them valid. @@ -250,9 +260,9 @@ where /// If the interner already interns the maximum number of strings possible /// by the chosen symbol type. 
#[inline] - pub fn get_or_intern(&mut self, string: T) -> ::Symbol + pub fn get_or_intern(&mut self, string: T) -> B::Symbol where - T: AsRef, + T: AsRef, { self.get_or_intern_using(string.as_ref(), B::intern) } @@ -271,10 +281,7 @@ where /// If the interner already interns the maximum number of strings possible /// by the chosen symbol type. #[inline] - pub fn get_or_intern_static( - &mut self, - string: &'static str, - ) -> ::Symbol { + pub fn get_or_intern_static(&mut self, string: &'static B::Str) -> B::Symbol { self.get_or_intern_using(string, B::intern_static) } @@ -285,7 +292,7 @@ where /// Returns the string for the given symbol if any. #[inline] - pub fn resolve(&self, symbol: ::Symbol) -> Option<&str> { + pub fn resolve(&self, symbol: B::Symbol) -> Option<&B::Str> { self.backend.resolve(symbol) } } @@ -293,9 +300,10 @@ where impl FromIterator for StringInterner where B: Backend, - ::Symbol: Symbol, + B::Str: Hash + PartialEq, + B::Symbol: Symbol, H: BuildHasher + Default, - T: AsRef, + T: AsRef, { fn from_iter(iter: I) -> Self where @@ -312,16 +320,17 @@ where impl Extend for StringInterner where B: Backend, - ::Symbol: Symbol, + B::Str: Hash + PartialEq, + B::Symbol: Symbol, H: BuildHasher, - T: AsRef, + T: AsRef, { fn extend(&mut self, iter: I) where I: IntoIterator, { for s in iter { - self.get_or_intern(s.as_ref()); + self.get_or_intern(s); } } } @@ -329,11 +338,10 @@ where impl<'a, B, H> IntoIterator for &'a StringInterner where B: Backend, - ::Symbol: Symbol, - &'a B: IntoIterator::Symbol, &'a str)>, + &'a B: IntoIterator, H: BuildHasher, { - type Item = (::Symbol, &'a str); + type Item = (B::Symbol, &'a B::Str); type IntoIter = <&'a B as IntoIterator>::IntoIter; #[cfg_attr(feature = "inline-more", inline)] diff --git a/src/lib.rs b/src/lib.rs index 026ec64..6313098 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,9 +9,9 @@ //! ### Example: Interning & Symbols //! //! ``` -//! use string_interner::StringInterner; +//! 
use string_interner::{StringInterner, DefaultBackend}; //! -//! let mut interner = StringInterner::default(); +//! let mut interner: StringInterner = StringInterner::default(); //! let sym0 = interner.get_or_intern("Elephant"); //! let sym1 = interner.get_or_intern("Tiger"); //! let sym2 = interner.get_or_intern("Horse"); @@ -25,7 +25,7 @@ //! ### Example: Creation by `FromIterator` //! //! ``` -//! # use string_interner::StringInterner; +//! # use string_interner::{StringInterner, DefaultBackend}; //! let interner = ["Elephant", "Tiger", "Horse", "Tiger"] //! .into_iter() //! .collect::(); @@ -34,8 +34,8 @@ //! ### Example: Look-up //! //! ``` -//! # use string_interner::StringInterner; -//! let mut interner = StringInterner::default(); +//! # use string_interner::{StringInterner, DefaultBackend}; +//! let mut interner: StringInterner = StringInterner::default(); //! let sym = interner.get_or_intern("Banana"); //! assert_eq!(interner.resolve(sym), Some("Banana")); //! ``` @@ -43,8 +43,8 @@ //! ### Example: Iteration //! //! ``` -//! # use string_interner::{StringInterner, Symbol}; -//! let interner = ::from_iter(["Earth", "Water", "Fire", "Air"]); +//! # use string_interner::{StringInterner, DefaultBackend, Symbol}; +//! let interner: StringInterner = StringInterner::from_iter(["Earth", "Water", "Fire", "Air"]); //! for (sym, str) in &interner { //! println!("{} = {}", sym.to_usize(), str); //! } @@ -55,7 +55,7 @@ //! ``` //! # use string_interner::StringInterner; //! use string_interner::backend::BufferBackend; -//! type Interner = StringInterner; +//! type Interner = StringInterner; //! let mut interner = Interner::new(); //! let sym1 = interner.get_or_intern("Tiger"); //! let sym2 = interner.get_or_intern("Horse"); @@ -69,7 +69,7 @@ //! ``` //! # use string_interner::StringInterner; //! use string_interner::{backend::BucketBackend, symbol::SymbolU16}; -//! type Interner = StringInterner>; +//! type Interner = StringInterner; //! 
let mut interner = Interner::new(); //! let sym1 = interner.get_or_intern("Tiger"); //! let sym2 = interner.get_or_intern("Horse"); diff --git a/src/serde_impl.rs b/src/serde_impl.rs index 326befc..e8d8fd9 100644 --- a/src/serde_impl.rs +++ b/src/serde_impl.rs @@ -22,12 +22,14 @@ use serde::{ Serializer, }, }; +use std::hash::Hash; impl Serialize for StringInterner where B: Backend, - ::Symbol: Symbol, - for<'a> &'a B: IntoIterator::Symbol, &'a str)>, + B::Symbol: Symbol, + B::Str: Serialize, + for<'a> &'a B: IntoIterator, H: BuildHasher, { fn serialize(&self, serializer: T) -> Result @@ -45,7 +47,8 @@ where impl<'de, B, H> Deserialize<'de> for StringInterner where B: Backend, - ::Symbol: Symbol, + B::Symbol: Symbol, + B::Str: Deserialize<'de> + Hash + PartialEq, H: BuildHasher + Default, { fn deserialize(deserializer: D) -> Result, D::Error> @@ -59,16 +62,16 @@ where struct StringInternerVisitor where B: Backend, - ::Symbol: Symbol, + B::Symbol: Symbol, H: BuildHasher, { - mark: marker::PhantomData<(::Symbol, B, H)>, + mark: marker::PhantomData<(B::Symbol, B, H, *const B::Str)>, } impl Default for StringInternerVisitor where B: Backend, - ::Symbol: Symbol, + B::Symbol: Symbol, H: BuildHasher, { fn default() -> Self { @@ -81,7 +84,8 @@ where impl<'de, B, H> Visitor<'de> for StringInternerVisitor where B: Backend, - ::Symbol: Symbol, + B::Str: Deserialize<'de> + Hash + PartialEq, + B::Symbol: Symbol, H: BuildHasher + Default, { type Value = StringInterner; @@ -98,8 +102,8 @@ where seq.size_hint().unwrap_or(0), H::default(), ); - while let Some(s) = seq.next_element::>()? { - interner.get_or_intern(s); + while let Some(s) = seq.next_element::>()? 
{ + interner.get_or_intern(&s); } Ok(interner) } diff --git a/tests/allocator.rs b/tests/allocator/mod.rs similarity index 98% rename from tests/allocator.rs rename to tests/allocator/mod.rs index 6b8ec79..b0fad8d 100644 --- a/tests/allocator.rs +++ b/tests/allocator/mod.rs @@ -44,8 +44,7 @@ impl TracingAllocator { unsafe impl GlobalAlloc for TracingAllocator { unsafe fn alloc(&self, layout: Layout) -> *mut u8 { self.stats.push_allocations(layout); - let res = self.inner.alloc(layout); - res + self.inner.alloc(layout) } unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { diff --git a/tests/tests.rs b/tests/tests.rs index ae51517..5978c42 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -38,15 +38,15 @@ pub trait BackendStats { const NAME: &'static str; } -impl BackendStats for backend::BucketBackend { - const MIN_OVERHEAD: f64 = 2.1; - const MAX_OVERHEAD: f64 = 2.33; - const MAX_ALLOCATIONS: usize = 66; - const MAX_DEALLOCATIONS: usize = 43; +impl BackendStats for backend::BucketBackend { + const MIN_OVERHEAD: f64 = 2.18; + const MAX_OVERHEAD: f64 = 3.01; + const MAX_ALLOCATIONS: usize = 65; + const MAX_DEALLOCATIONS: usize = 42; const NAME: &'static str = "BucketBackend"; } -impl BackendStats for backend::SimpleBackend { +impl BackendStats for backend::SimpleBackend { const MIN_OVERHEAD: f64 = 2.1; const MAX_OVERHEAD: f64 = 2.33; const MAX_ALLOCATIONS: usize = 1000040; @@ -54,7 +54,7 @@ impl BackendStats for backend::SimpleBackend { const NAME: &'static str = "SimpleBackend"; } -impl BackendStats for backend::StringBackend { +impl BackendStats for backend::StringBackend { const MIN_OVERHEAD: f64 = 1.7; const MAX_OVERHEAD: f64 = 1.93; const MAX_ALLOCATIONS: usize = 62; @@ -62,7 +62,7 @@ impl BackendStats for backend::StringBackend { const NAME: &'static str = "StringBackend"; } -impl BackendStats for backend::BufferBackend { +impl BackendStats for backend::buffer::BufferBackend { const MIN_OVERHEAD: f64 = 1.35; const MAX_OVERHEAD: f64 = 1.58; const 
MAX_ALLOCATIONS: usize = 43; @@ -379,23 +379,23 @@ macro_rules! gen_tests_for_backend { mod bucket_backend { use super::*; - gen_tests_for_backend!(backend::BucketBackend); + gen_tests_for_backend!(backend::BucketBackend); } mod simple_backend { use super::*; - gen_tests_for_backend!(backend::SimpleBackend); + gen_tests_for_backend!(backend::SimpleBackend); } mod string_backend { use super::*; - gen_tests_for_backend!(backend::StringBackend); + gen_tests_for_backend!(backend::StringBackend); } mod buffer_backend { use super::*; - gen_tests_for_backend!(backend::BufferBackend); + gen_tests_for_backend!(backend::BufferBackend); }