From 000355449b5d6a6599ba795358122549ad2e68fd Mon Sep 17 00:00:00 2001 From: Russell Johnston Date: Sun, 12 Apr 2020 20:05:50 -0700 Subject: [PATCH] Make `Symbol` non-zero to distinguish it from infinity in `vm::Value` Includes some `Symbol` clean-up from #42. Closes #43 by treating NaN as a tagged value. Future work on #40 may take another approach. --- gml/src/symbol.rs | 337 ++++++++++++++++++++++---------------------- gml/src/vm/value.rs | 77 ++++++++-- 2 files changed, 235 insertions(+), 179 deletions(-) diff --git a/gml/src/symbol.rs b/gml/src/symbol.rs index e4c7dc4..ec39370 100644 --- a/gml/src/symbol.rs +++ b/gml/src/symbol.rs @@ -1,4 +1,5 @@ -use std::{mem, ops, cmp, fmt}; +use std::{ops, cmp, fmt}; +use std::num::NonZeroUsize; use std::marker::PhantomData; use std::hash::{Hash, Hasher}; use std::borrow::Borrow; @@ -8,260 +9,258 @@ use std::collections::HashSet; /// A symbol is an index into a thread-local interner. #[derive(Copy, Clone, PartialEq, Eq, Hash)] pub struct Symbol { - index: u32, + // Symbols must be non-zero for use in `vm::Value`. + index: NonZeroUsize, + + // Symbols must be `!Send` and `!Sync` to avoid crossing interners. _marker: PhantomData<*const str>, } -impl Default for Symbol { - fn default() -> Self { - Symbol::intern("") - } +struct Interner { + strings: HashSet, + indices: Vec<*const str>, +} + +struct Entry { + string: Box, + index: NonZeroUsize, } impl Symbol { - /// Map a string to its interned symbol + /// Map a string to its interned symbol. pub fn intern(string: &str) -> Self { - Interner::with(|interner| interner.intern(string)) + Interner::with(|interner| Symbol { index: interner.intern(string), _marker: PhantomData }) } - pub fn into_index(self) -> u32 { - self.index - } + pub fn into_index(self) -> NonZeroUsize { self.index } - pub fn from_index(index: u32) -> Symbol { - Symbol { index, _marker: PhantomData } - } + pub fn from_index(index: NonZeroUsize) -> Symbol { Symbol { index, _marker: PhantomData } } +} + +impl Default for Symbol { + fn default() -> Self { Symbol::intern("") } } impl ops::Deref for Symbol { type Target = str; fn deref(&self) -> &str { - Interner::with(|interner| unsafe { mem::transmute(interner.get(*self)) }) + // Safety: `Symbol` is not `Send` or `Sync`, and is always allocated from a thread-local + // `Interner`. This ensures the string will not be freed until the thread dies and takes + // all associated `Symbol`s with it. + unsafe { &*Interner::with(|interner| interner.get(self.index)) } } } impl Borrow for Symbol { - fn borrow(&self) -> &str { - self - } + fn borrow(&self) -> &str { self } } -impl cmp::PartialOrd for Symbol { - fn partial_cmp(&self, other: &Self) -> Option { - Some(Symbol::cmp(self, other)) - } +impl cmp::Ord for Symbol { + fn cmp(&self, other: &Self) -> cmp::Ordering { str::cmp(self, other) } } -impl cmp::Ord for Symbol { - fn cmp(&self, other: &Self) -> cmp::Ordering { - let a: &str = self; - let b: &str = other; - str::cmp(a, b) - } +impl cmp::PartialOrd for Symbol { + fn partial_cmp(&self, other: &Self) -> Option { Some(Self::cmp(self, other)) } } impl fmt::Debug for Symbol { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}({})", self, self.into_index()) + ::fmt(self, f)?; + f.write_str("@")?; + ::fmt(&self.index, f)?; + Ok(()) } } impl fmt::Display for Symbol { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let string: &str = self; - write!(f, "{}", string) - } -} - -#[derive(Default)] -struct Interner { - strings: HashSet, - ids: Vec<*const str>, -} - -struct Entry { - string: Box, - id: u32, -} - -impl cmp::PartialEq for Entry { - fn eq(&self, other: &Self) -> bool { - let a: &str = self.borrow(); - let b: &str = other.borrow(); - a == b - } -} - -impl cmp::Eq for Entry {} - -impl Hash for Entry { - fn hash(&self, state: &mut H) where H: Hasher { - let a: &str = self.borrow(); - a.hash(state) - } -} - -impl Borrow for Entry { - fn borrow(&self) -> &str { - &self.string - } + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { ::fmt(self, f) } } impl Interner { - fn fill(strings: &[&str]) -> Self { - let mut interner = Interner::default(); - for &string in strings { - interner.intern(string); - } - interner - } - - fn intern(&mut self, string: &str) -> Symbol { + fn intern(&mut self, string: &str) -> NonZeroUsize { if let Some(entry) = self.strings.get(string) { - return Symbol::from_index(entry.id); + return entry.index; } let string = String::from(string).into_boxed_str(); let data = &*string as *const str; - let id = self.ids.len() as u32; - self.strings.insert(Entry { string, id }); - self.ids.push(data); + // Safety: `self.indices` always has at least one entry. + let index = unsafe { NonZeroUsize::new_unchecked(self.indices.len()) }; + self.strings.insert(Entry { string, index }); + self.indices.push(data); - Symbol::from_index(id) + index } - fn get(&self, symbol: Symbol) -> &str { - let index = symbol.into_index(); - unsafe { &*self.ids[index as usize] } + fn get(&self, index: NonZeroUsize) -> *const str { + self.indices[index.get()] } fn with T>(f: F) -> T { - thread_local!(static INTERNER: RefCell = { - RefCell::new(Interner::new()) - }); + thread_local!(static INTERNER: RefCell = RefCell::new(Interner::with_keywords())); INTERNER.with(|interner| f(&mut *interner.borrow_mut())) } } +impl Default for Interner { + fn default() -> Self { + Interner { strings: HashSet::default(), indices: vec!["UNUSED"] } + } +} + +impl Borrow for Entry { + fn borrow(&self) -> &str { &self.string } +} + +impl cmp::Eq for Entry {} + +impl cmp::PartialEq for Entry { + fn eq(&self, other: &Self) -> bool { ::eq(self.borrow(), other.borrow()) } +} + +impl Hash for Entry { + fn hash(&self, state: &mut H) { str::hash(self.borrow(), state) } +} + macro_rules! declare_symbols {( keywords: $(($index: expr, $name: ident, $string: expr))* arguments: $(($symbol_index: expr, $argument_index: expr))* ) => { #[allow(non_upper_case_globals)] pub mod keyword { + use std::num::NonZeroUsize; use std::marker::PhantomData; use super::Symbol; - $(pub const $name: Symbol = Symbol { index: $index, _marker: PhantomData };)* + // Safety: The indices below are all non-zero. + $(pub const $name: Symbol = unsafe { + let index = NonZeroUsize::new_unchecked($index); + Symbol { index, _marker: PhantomData } + };)* } impl Interner { - fn new() -> Self { - Interner::fill(&[ - $($string,)* - $(concat!("argument", $argument_index),)* - ]) + fn with_keywords() -> Self { + let mut interner = Self::default(); + + $(interner.intern($string);)* + $(interner.intern(concat!("argument", $argument_index));)* + + interner } } }} declare_symbols! { keywords: - (0, True, "true") - (1, False, "false") - - (2, Self_, "self") - (3, Other, "other") - (4, All, "all") - (5, NoOne, "noone") - (6, Global, "global") - (7, Local, "local") - - (8, Var, "var") - (9, GlobalVar, "globalvar") - - (10, If, "if") - (11, Then, "then") - (12, Else, "else") - (13, Repeat, "repeat") - (14, While, "while") - (15, Do, "do") - (16, Until, "until") - (17, For, "for") - (18, With, "with") - (19, Switch, "switch") - (20, Case, "case") - (21, Default, "default") - (22, Break, "break") - (23, Continue, "continue") - (24, Exit, "exit") - (25, Return, "return") - - (26, Begin, "begin") - (27, End, "end") - - (28, Not, "not") - (29, Div, "div") - (30, Mod, "mod") - (31, And, "and") - (32, Or, "or") - (33, Xor, "xor") + (1, True, "true") + (2, False, "false") + + (3, Self_, "self") + (4, Other, "other") + (5, All, "all") + (6, NoOne, "noone") + (7, Global, "global") + (8, Local, "local") + + (9, Var, "var") + (10, GlobalVar, "globalvar") + + (11, If, "if") + (12, Then, "then") + (13, Else, "else") + (14, Repeat, "repeat") + (15, While, "while") + (16, Do, "do") + (17, Until, "until") + (18, For, "for") + (19, With, "with") + (20, Switch, "switch") + (21, Case, "case") + (22, Default, "default") + (23, Break, "break") + (24, Continue, "continue") + (25, Exit, "exit") + (26, Return, "return") + + (27, Begin, "begin") + (28, End, "end") + + (29, Not, "not") + (30, Div, "div") + (31, Mod, "mod") + (32, And, "and") + (33, Or, "or") + (34, Xor, "xor") arguments: - (34, 0) - (35, 1) - (36, 2) - (37, 3) - (38, 4) - (39, 5) - (40, 6) - (41, 7) - (42, 8) - (43, 9) - (44, 10) - (45, 11) - (46, 12) - (47, 13) - (48, 14) - (49, 15) + (35, 0) + (36, 1) + (37, 2) + (38, 3) + (39, 4) + (40, 5) + (41, 6) + (42, 7) + (43, 8) + (44, 9) + (45, 10) + (46, 11) + (47, 12) + (48, 13) + (49, 14) + (50, 15) } impl Symbol { - pub fn is_keyword(&self) -> bool { - self.index < 34 - } + pub fn is_keyword(&self) -> bool { self.index.get() < 35 } - pub fn is_argument(&self) -> bool { - 34 <= self.index && self.index < 50 - } + pub fn is_argument(&self) -> bool { 35 <= self.index.get() && self.index.get() < 51 } pub fn as_argument(&self) -> Option { - if self.is_argument() { - Some(self.index - 34) - } else { - None - } + if self.is_argument() { Some(self.index.get() as u32 - 35) } else { None } } pub fn from_argument(argument: u32) -> Symbol { assert!(argument < 16); - Symbol::from_index(34 + argument) + let index = NonZeroUsize::new(35 + argument as usize).unwrap(); + Symbol::from_index(index) } } #[cfg(test)] mod tests { - use super::*; + use super::{Symbol, keyword}; #[test] - fn intern() { - let mut i = Interner::default(); - - assert_eq!(i.intern("dog"), Symbol::from_index(0)); - assert_eq!(i.intern("dog"), Symbol::from_index(0)); - assert_eq!(i.intern("cat"), Symbol::from_index(1)); - assert_eq!(i.intern("cat"), Symbol::from_index(1)); - assert_eq!(i.intern("dog"), Symbol::from_index(0)); + fn keywords() { + let empty = Symbol::default(); + assert_eq!(empty, empty); + + let keyword = Symbol::intern("other"); + assert_eq!(keyword, keyword::Other); + + let arg = Symbol::intern("argument3"); + assert_eq!(arg, Symbol::from_argument(3)); + } + + #[test] + fn alloc() { + let dog1 = Symbol::intern("dog"); + assert_eq!(&*dog1, "dog"); + + let dog2 = Symbol::intern("dog"); + assert_eq!(&*dog2, "dog"); + assert_eq!(dog1, dog2); + + let cat1 = Symbol::intern("cat"); + assert_eq!(&*cat1, "cat"); + + let cat2 = Symbol::intern("cat"); + assert_eq!(&*cat2, "cat"); + assert_eq!(cat1, cat2); + + assert_ne!(dog1, cat1); } } diff --git a/gml/src/vm/value.rs b/gml/src/vm/value.rs index 607e3ac..fe01ccb 100644 --- a/gml/src/vm/value.rs +++ b/gml/src/vm/value.rs @@ -1,4 +1,5 @@ -use std::{mem, fmt}; +use std::{mem, fmt, f64}; +use std::num::NonZeroUsize; use std::convert::TryFrom; use crate::symbol::Symbol; @@ -11,9 +12,8 @@ use crate::vm; /// payloads. /// /// To avoid ambiguity, NaNs are canonicalized. The hardware seems to use positive qNaN with a zero -/// payload (0x7fff8_0000_0000_0000), so other types are encoded as negative NaNs, leaving 52 bits -/// for tag and value (including the quiet bit). This could be expanded to positive NaNs at the cost -/// of more complicated type checking. +/// payload (0x7ff8_0000_0000_0000), so other types are encoded as negative NaNs, leaving 52 bits +/// for tag and value (including the quiet bit- we ensure the payload is non-zero elsewhere). /// /// By limiting ourselves to 48-bit pointers (the current limit on x86_64 and AArch64, and a nice /// round number for sign extension), we get 4 bits for a tag. This could be expanded to 5 bits by @@ -23,6 +23,8 @@ use crate::vm; /// 4-bit tag values: /// 0000 - string /// 0001 - array +/// +/// 1000 - canonical NaN #[derive(Copy, Clone, PartialEq, Eq, Hash)] pub struct Value(u64); @@ -42,16 +44,25 @@ pub enum Type { impl Value { pub fn data(self) -> Data { let Value(value) = self; - let tag = value >> 48; - let payload = value & ((1 << 48) - 1); - if tag & !0xf != 0xfff0 { - return Data::Real(unsafe { mem::transmute::<_, f64>(value) }); + // This check covers finite, infinite, and positive NaN reals. + // Negative NaN is handled below with tagged values. + if value <= 0xfff0_0000_0000_0000 { + return Data::Real(f64::from_bits(value)); } + let tag = value >> 48; + let payload = value & ((1 << 48) - 1); match tag & 0xf { - 0x0 => Data::String(Symbol::from_index(payload as u32)), + // Safety: String values are always constructed from non-zero `Symbol`s. + // (The check above also excludes cases where both `tag & 0xf` and `payload` are zero.) + 0x0 => Data::String(Symbol::from_index(unsafe { + NonZeroUsize::new_unchecked(payload as usize) + })), + 0x1 => Data::Array(unsafe { vm::Array::clone_from_raw(payload as *const _) }), + + 0x8 => Data::Real(f64::from_bits(value)), _ => unreachable!("corrupt value"), } } @@ -95,7 +106,7 @@ impl From for Value { impl From for Value { fn from(value: Symbol) -> Value { let tag = 0xfff0 | 0x0; - let value = value.into_index() as u64; + let value = value.into_index().get() as u64; Value((tag << 48) | value) } @@ -214,3 +225,49 @@ impl TryFrom for bool { } } } + +#[cfg(test)] +mod tests { + use crate::symbol::{keyword, Symbol}; + use crate::vm; + + #[test] + fn reals() { + let value = vm::Value::from(0.0); + assert!(match value.data() { vm::Data::Real(x) if x == 0.0 => true, _ => false }); + + let value = vm::Value::from(3.5); + assert!(match value.data() { vm::Data::Real(x) if x == 3.5 => true, _ => false }); + + let value = vm::Value::from(f64::INFINITY); + assert!(match value.data() { + vm::Data::Real(x) if x == f64::INFINITY => true, + _ => false + }); + + let value = vm::Value::from(f64::NEG_INFINITY); + assert!(match value.data() { + vm::Data::Real(x) if x == f64::NEG_INFINITY => true, + _ => false + }); + + let value = vm::Value::from(f64::NAN); + assert!(match value.data() { vm::Data::Real(x) if f64::is_nan(x) => true, _ => false }); + } + + #[test] + fn strings() { + let value = vm::Value::from(Symbol::intern("true")); + assert!(match value.data() { vm::Data::String(keyword::True) => true, _ => false }); + + let value = vm::Value::from(Symbol::intern("argument0")); + assert!(match value.data() { + vm::Data::String(x) if x == Symbol::from_argument(0) => true, + _ => false + }); + + let symbol = Symbol::intern("foo"); + let value = vm::Value::from(symbol); + assert!(match value.data() { vm::Data::String(x) if x == symbol => true, _ => false }); + } +}