From 8bc957cc88f224f3693a75d9b2101f64f1176a2b Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Oct 2021 17:36:23 +0100 Subject: [PATCH 01/17] Apply rustfmt to rand::distributions::uniform module --- src/distributions/uniform.rs | 104 ++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 31 deletions(-) diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 261357b245..296e528c37 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -106,8 +106,8 @@ //! [`UniformDuration`]: crate::distributions::uniform::UniformDuration //! [`SampleBorrow::borrow`]: crate::distributions::uniform::SampleBorrow::borrow -use core::time::Duration; use core::ops::{Range, RangeInclusive}; +use core::time::Duration; use crate::distributions::float::IntoFloat; use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; @@ -120,8 +120,7 @@ use crate::distributions::utils::Float; #[cfg(feature = "simd_support")] use packed_simd::*; -#[cfg(feature = "serde1")] -use serde::{Serialize, Deserialize}; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; /// Sample values uniformly between two bounds. /// @@ -175,7 +174,10 @@ use serde::{Serialize, Deserialize}; #[derive(Clone, Copy, Debug, PartialEq)] #[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] #[cfg_attr(feature = "serde1", serde(bound(serialize = "X::Sampler: Serialize")))] -#[cfg_attr(feature = "serde1", serde(bound(deserialize = "X::Sampler: Deserialize<'de>")))] +#[cfg_attr( + feature = "serde1", + serde(bound(deserialize = "X::Sampler: Deserialize<'de>")) +)] pub struct Uniform(X::Sampler); impl Uniform { @@ -291,10 +293,10 @@ pub trait UniformSampler: Sized { /// some types more optimal implementations for single usage may be provided /// via this method. /// Results may not be identical. - fn sample_single_inclusive(low: B1, high: B2, rng: &mut R) - -> Self::X - where B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized + fn sample_single_inclusive(low: B1, high: B2, rng: &mut R) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, { let uniform: Self = UniformSampler::new_inclusive(low, high); uniform.sample(rng) @@ -516,14 +518,19 @@ macro_rules! uniform_int_impl { } #[inline] - fn sample_single_inclusive(low_b: B1, high_b: B2, rng: &mut R) -> Self::X + fn sample_single_inclusive( + low_b: B1, high_b: B2, rng: &mut R, + ) -> Self::X where B1: SampleBorrow + Sized, B2: SampleBorrow + Sized, { let low = *low_b.borrow(); let high = *high_b.borrow(); - assert!(low <= high, "UniformSampler::sample_single_inclusive: low > high"); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; // If the above resulted in wrap-around to 0, the range is $ty::MIN..=$ty::MAX, // and any integer will do. @@ -934,7 +941,10 @@ macro_rules! 
uniform_float_impl { "UniformSampler::sample_single: low >= high" ); let mut scale = high - low; - assert!(scale.all_finite(), "UniformSampler::sample_single: range overflow"); + assert!( + scale.all_finite(), + "UniformSampler::sample_single: range overflow" + ); loop { // Generate a value in the range [1, 2) @@ -1156,24 +1166,43 @@ mod tests { #[cfg(feature = "serde1")] fn test_serialization_uniform_duration() { let distr = UniformDuration::new(Duration::from_secs(10), Duration::from_secs(60)); - let de_distr: UniformDuration = bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); - assert_eq!( - distr.offset, de_distr.offset - ); + let de_distr: UniformDuration = + bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); + assert_eq!(distr.offset, de_distr.offset); match (distr.mode, de_distr.mode) { - (UniformDurationMode::Small {secs: a_secs, nanos: a_nanos}, UniformDurationMode::Small {secs, nanos}) => { + ( + UniformDurationMode::Small { + secs: a_secs, + nanos: a_nanos, + }, + UniformDurationMode::Small { secs, nanos }, + ) => { assert_eq!(a_secs, secs); assert_eq!(a_nanos.0.low, nanos.0.low); assert_eq!(a_nanos.0.range, nanos.0.range); assert_eq!(a_nanos.0.z, nanos.0.z); } - (UniformDurationMode::Medium {nanos: a_nanos} , UniformDurationMode::Medium {nanos}) => { + ( + UniformDurationMode::Medium { nanos: a_nanos }, + UniformDurationMode::Medium { nanos }, + ) => { assert_eq!(a_nanos.0.low, nanos.0.low); assert_eq!(a_nanos.0.range, nanos.0.range); assert_eq!(a_nanos.0.z, nanos.0.z); } - (UniformDurationMode::Large {max_secs:a_max_secs, max_nanos:a_max_nanos, secs:a_secs}, UniformDurationMode::Large {max_secs, max_nanos, secs} ) => { + ( + UniformDurationMode::Large { + max_secs: a_max_secs, + max_nanos: a_max_nanos, + secs: a_secs, + }, + UniformDurationMode::Large { + max_secs, + max_nanos, + secs, + }, + ) => { assert_eq!(a_max_secs, max_secs); assert_eq!(a_max_nanos, max_nanos); @@ -1181,22 +1210,24 @@ mod tests { assert_eq!(a_secs.0.range, secs.0.range); assert_eq!(a_secs.0.z, secs.0.z); } - _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly") + _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly"), } } - + #[test] #[cfg(feature = "serde1")] fn test_uniform_serialization() { - let unit_box: Uniform = Uniform::new(-1, 1); - let de_unit_box: Uniform = bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + let unit_box: Uniform = Uniform::new(-1, 1); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); assert_eq!(unit_box.0.low, de_unit_box.0.low); assert_eq!(unit_box.0.range, de_unit_box.0.range); assert_eq!(unit_box.0.z, de_unit_box.0.z); let unit_box: Uniform = Uniform::new(-1., 1.); - let de_unit_box: Uniform = bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); assert_eq!(unit_box.0.low, de_unit_box.0.low); assert_eq!(unit_box.0.scale, de_unit_box.0.scale); @@ -1361,8 +1392,9 @@ mod tests { assert!(low_scalar <= v && v < high_scalar); let v = rng.sample(my_incl_uniform).extract(lane); assert!(low_scalar <= v && v <= high_scalar); - let v = <$ty as SampleUniform>::Sampler - ::sample_single(low, high, &mut rng).extract(lane); + let v = + <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng) + .extract(lane); assert!(low_scalar <= v && v < high_scalar); } @@ -1373,9 +1405,15 @@ mod tests { 
assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar); assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar); - assert_eq!(<$ty as SampleUniform>::Sampler - ::sample_single(low, high, &mut zero_rng) - .extract(lane), low_scalar); + assert_eq!( + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut zero_rng + ) + .extract(lane), + low_scalar + ); assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar); assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar); @@ -1388,9 +1426,13 @@ mod tests { (-1i64 << $bits_shifted) as u64, ); assert!( - <$ty as SampleUniform>::Sampler - ::sample_single(low, high, &mut lowering_max_rng) - .extract(lane) < high_scalar + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut lowering_max_rng + ) + .extract(lane) + < high_scalar ); } } From 8449015a40e695859bc55a0c20c97c362c0afe14 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 24 Feb 2022 11:22:35 +0000 Subject: [PATCH 02/17] Move UniformChar, UniformDuration to new submodule --- src/distributions/uniform.rs | 358 ++------------------- src/distributions/uniform/uniform_other.rs | 341 ++++++++++++++++++++ 2 files changed, 363 insertions(+), 336 deletions(-) create mode 100644 src/distributions/uniform/uniform_other.rs diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 296e528c37..07a4ddbbf2 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -107,7 +107,6 @@ //! [`SampleBorrow::borrow`]: crate::distributions::uniform::SampleBorrow::borrow use core::ops::{Range, RangeInclusive}; -use core::time::Duration; use crate::distributions::float::IntoFloat; use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; @@ -122,6 +121,10 @@ use crate::distributions::utils::Float; #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; +mod uniform_other; + +pub use uniform_other::{UniformChar, UniformDuration}; + /// Sample values uniformly between two bounds. /// /// [`Uniform::new`] and [`Uniform::new_inclusive`] construct a uniform @@ -721,79 +724,6 @@ uniform_simd_int_impl! { u8 } -impl SampleUniform for char { - type Sampler = UniformChar; -} - -/// The back-end implementing [`UniformSampler`] for `char`. -/// -/// Unless you are implementing [`UniformSampler`] for your own type, this type -/// should not be used directly, use [`Uniform`] instead. -/// -/// This differs from integer range sampling since the range `0xD800..=0xDFFF` -/// are used for surrogate pairs in UCS and UTF-16, and consequently are not -/// valid Unicode code points. We must therefore avoid sampling values in this -/// range. -#[derive(Clone, Copy, Debug)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformChar { - sampler: UniformInt, -} - -/// UTF-16 surrogate range start -const CHAR_SURROGATE_START: u32 = 0xD800; -/// UTF-16 surrogate range size -const CHAR_SURROGATE_LEN: u32 = 0xE000 - CHAR_SURROGATE_START; - -/// Convert `char` to compressed `u32` -fn char_to_comp_u32(c: char) -> u32 { - match c as u32 { - c if c >= CHAR_SURROGATE_START => c - CHAR_SURROGATE_LEN, - c => c, - } -} - -impl UniformSampler for UniformChar { - type X = char; - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. 
- fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = char_to_comp_u32(*low_b.borrow()); - let high = char_to_comp_u32(*high_b.borrow()); - let sampler = UniformInt::::new(low, high); - UniformChar { sampler } - } - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = char_to_comp_u32(*low_b.borrow()); - let high = char_to_comp_u32(*high_b.borrow()); - let sampler = UniformInt::::new_inclusive(low, high); - UniformChar { sampler } - } - - fn sample(&self, rng: &mut R) -> Self::X { - let mut x = self.sampler.sample(rng); - if x >= CHAR_SURROGATE_START { - x += CHAR_SURROGATE_LEN; - } - // SAFETY: x must not be in surrogate range or greater than char::MAX. - // This relies on range constructors which accept char arguments. - // Validity of input char values is assumed. - unsafe { core::char::from_u32_unchecked(x) } - } -} - /// The back-end implementing [`UniformSampler`] for floating-point types. /// /// Unless you are implementing [`UniformSampler`] for your own type, this type @@ -1025,195 +955,11 @@ uniform_float_impl! { f64x4, u64x4, f64, u64, 64 - 52 } uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 } -/// The back-end implementing [`UniformSampler`] for `Duration`. -/// -/// Unless you are implementing [`UniformSampler`] for your own types, this type -/// should not be used directly, use [`Uniform`] instead. -#[derive(Clone, Copy, Debug)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformDuration { - mode: UniformDurationMode, - offset: u32, -} - -#[derive(Debug, Copy, Clone)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -enum UniformDurationMode { - Small { - secs: u64, - nanos: Uniform, - }, - Medium { - nanos: Uniform, - }, - Large { - max_secs: u64, - max_nanos: u32, - secs: Uniform, - }, -} - -impl SampleUniform for Duration { - type Sampler = UniformDuration; -} - -impl UniformSampler for UniformDuration { - type X = Duration; - - #[inline] - fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low < high, "Uniform::new called with `low >= high`"); - UniformDuration::new_inclusive(low, high - Duration::new(0, 1)) - } - - #[inline] - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "Uniform::new_inclusive called with `low > high`" - ); - - let low_s = low.as_secs(); - let low_n = low.subsec_nanos(); - let mut high_s = high.as_secs(); - let mut high_n = high.subsec_nanos(); - - if high_n < low_n { - high_s -= 1; - high_n += 1_000_000_000; - } - - let mode = if low_s == high_s { - UniformDurationMode::Small { - secs: low_s, - nanos: Uniform::new_inclusive(low_n, high_n), - } - } else { - let max = high_s - .checked_mul(1_000_000_000) - .and_then(|n| n.checked_add(u64::from(high_n))); - - if let Some(higher_bound) = max { - let lower_bound = low_s * 1_000_000_000 + u64::from(low_n); - UniformDurationMode::Medium { - nanos: Uniform::new_inclusive(lower_bound, higher_bound), - } - } else { - // An offset is applied to simplify generation of nanoseconds - let max_nanos = high_n - low_n; - 
UniformDurationMode::Large { - max_secs: high_s, - max_nanos, - secs: Uniform::new_inclusive(low_s, high_s), - } - } - }; - UniformDuration { - mode, - offset: low_n, - } - } - - #[inline] - fn sample(&self, rng: &mut R) -> Duration { - match self.mode { - UniformDurationMode::Small { secs, nanos } => { - let n = nanos.sample(rng); - Duration::new(secs, n) - } - UniformDurationMode::Medium { nanos } => { - let nanos = nanos.sample(rng); - Duration::new(nanos / 1_000_000_000, (nanos % 1_000_000_000) as u32) - } - UniformDurationMode::Large { - max_secs, - max_nanos, - secs, - } => { - // constant folding means this is at least as fast as `Rng::sample(Range)` - let nano_range = Uniform::new(0, 1_000_000_000); - loop { - let s = secs.sample(rng); - let n = nano_range.sample(rng); - if !(s == max_secs && n > max_nanos) { - let sum = n + self.offset; - break Duration::new(s, sum); - } - } - } - } - } -} - #[cfg(test)] mod tests { use super::*; use crate::rngs::mock::StepRng; - #[test] - #[cfg(feature = "serde1")] - fn test_serialization_uniform_duration() { - let distr = UniformDuration::new(Duration::from_secs(10), Duration::from_secs(60)); - let de_distr: UniformDuration = - bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); - assert_eq!(distr.offset, de_distr.offset); - match (distr.mode, de_distr.mode) { - ( - UniformDurationMode::Small { - secs: a_secs, - nanos: a_nanos, - }, - UniformDurationMode::Small { secs, nanos }, - ) => { - assert_eq!(a_secs, secs); - - assert_eq!(a_nanos.0.low, nanos.0.low); - assert_eq!(a_nanos.0.range, nanos.0.range); - assert_eq!(a_nanos.0.z, nanos.0.z); - } - ( - UniformDurationMode::Medium { nanos: a_nanos }, - UniformDurationMode::Medium { nanos }, - ) => { - assert_eq!(a_nanos.0.low, nanos.0.low); - assert_eq!(a_nanos.0.range, nanos.0.range); - assert_eq!(a_nanos.0.z, nanos.0.z); - } - ( - UniformDurationMode::Large { - max_secs: a_max_secs, - max_nanos: a_max_nanos, - secs: a_secs, - }, - UniformDurationMode::Large { - max_secs, - max_nanos, - secs, - }, - ) => { - assert_eq!(a_max_secs, max_secs); - assert_eq!(a_max_nanos, max_nanos); - - assert_eq!(a_secs.0.low, secs.0.low); - assert_eq!(a_secs.0.range, secs.0.range); - assert_eq!(a_secs.0.z, secs.0.z); - } - _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly"), - } - } - #[test] #[cfg(feature = "serde1")] fn test_uniform_serialization() { @@ -1340,27 +1086,6 @@ mod tests { } } - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_char() { - let mut rng = crate::test::rng(891); - let mut max = core::char::from_u32(0).unwrap(); - for _ in 0..100 { - let c = rng.gen_range('A'..='Z'); - assert!(('A'..='Z').contains(&c)); - max = max.max(c); - } - assert_eq!(max, 'Z'); - let d = Uniform::new( - core::char::from_u32(0xD7F0).unwrap(), - core::char::from_u32(0xE010).unwrap(), - ); - for _ in 0..100 { - let c = d.sample(&mut rng); - assert!((c as u32) < 0xD800 || (c as u32) > 0xDFFF); - } - } - #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_floats() { @@ -1543,29 +1268,6 @@ mod tests { } } - - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_durations() { - let mut rng = crate::test::rng(253); - - let v = &[ - (Duration::new(10, 50000), Duration::new(100, 1234)), - (Duration::new(0, 100), Duration::new(1, 50)), - ( - Duration::new(0, 0), - Duration::new(u64::max_value(), 999_999_999), - ), - ]; - for &(low, high) in v.iter() { - let my_uniform = Uniform::new(low, high); - for _ in 0..1000 { - let v = rng.sample(my_uniform); 
- assert!(low <= v && v < high); - } - } - } - #[test] fn test_custom_uniform() { use crate::distributions::uniform::{ @@ -1636,26 +1338,26 @@ mod tests { assert!(r.0.scale < 5.0 + 1e-14); } - #[test] - fn value_stability() { - fn test_samples( - lb: T, ub: T, expected_single: &[T], expected_multiple: &[T], - ) where Uniform: Distribution { - let mut rng = crate::test::rng(897); - let mut buf = [lb; 3]; - - for x in &mut buf { - *x = T::Sampler::sample_single(lb, ub, &mut rng); - } - assert_eq!(&buf, expected_single); + pub(crate) fn test_samples( + lb: T, ub: T, expected_single: &[T], expected_multiple: &[T], + ) where Uniform: Distribution { + let mut rng = crate::test::rng(897); + let mut buf = [lb; 3]; - let distr = Uniform::new(lb, ub); - for x in &mut buf { - *x = rng.sample(&distr); - } - assert_eq!(&buf, expected_multiple); + for x in &mut buf { + *x = T::Sampler::sample_single(lb, ub, &mut rng); + } + assert_eq!(&buf, expected_single); + + let distr = Uniform::new(lb, ub); + for x in &mut buf { + *x = rng.sample(&distr); } + assert_eq!(&buf, expected_multiple); + } + #[test] + fn value_stability() { // We test on a sub-set of types; possibly we should do more. // TODO: SIMD types @@ -1673,28 +1375,12 @@ mod tests { &[-4673848682.871551, 6388267422.932352, 4857075081.198343], &[1173375212.1808167, 1917642852.109581, 2365076174.3153973], ); - - test_samples( - Duration::new(2, 0), - Duration::new(4, 0), - &[ - Duration::new(2, 532615131), - Duration::new(3, 638826742), - Duration::new(3, 485707508), - ], - &[ - Duration::new(3, 117337521), - Duration::new(3, 191764285), - Duration::new(3, 236507617), - ], - ); } #[test] fn uniform_distributions_can_be_compared() { assert_eq!(Uniform::new(1.0, 2.0), Uniform::new(1.0, 2.0)); - // To cover UniformInt - assert_eq!(Uniform::new(1 as u32, 2 as u32), Uniform::new(1 as u32, 2 as u32)); + assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); } } diff --git a/src/distributions/uniform/uniform_other.rs b/src/distributions/uniform/uniform_other.rs new file mode 100644 index 0000000000..f8232c484c --- /dev/null +++ b/src/distributions/uniform/uniform_other.rs @@ -0,0 +1,341 @@ +// Copyright 2018-2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{SampleBorrow, SampleUniform, Uniform, UniformInt, UniformSampler}; +use crate::distributions::Distribution; +use crate::Rng; +use core::time::Duration; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; + +impl SampleUniform for char { + type Sampler = UniformChar; +} + +/// The back-end implementing [`UniformSampler`] for `char`. +/// +/// Unless you are implementing [`UniformSampler`] for your own type, this type +/// should not be used directly, use [`Uniform`] instead. +/// +/// This differs from integer range sampling since the range `0xD800..=0xDFFF` +/// are used for surrogate pairs in UCS and UTF-16, and consequently are not +/// valid Unicode code points. We must therefore avoid sampling values in this +/// range. 
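[Note on the technique above] The doc comment just above explains why `char` sampling cannot simply draw a code point: the surrogate range `0xD800..=0xDFFF` must be skipped. A minimal standalone sketch of the same gap-compression idea, separate from the patch (the helper names are illustrative and not part of the rand API):

const SURROGATE_START: u32 = 0xD800;
const SURROGATE_LEN: u32 = 0xE000 - SURROGATE_START;

// Remove the surrogate gap so valid code points form one contiguous range.
fn compress(c: char) -> u32 {
    let c = c as u32;
    if c >= SURROGATE_START { c - SURROGATE_LEN } else { c }
}

// Re-insert the gap; a compressed value never lands inside it, so the
// conversion back to `char` always succeeds.
fn decompress(x: u32) -> char {
    let x = if x >= SURROGATE_START { x + SURROGATE_LEN } else { x };
    core::char::from_u32(x).expect("not a surrogate and <= char::MAX")
}

fn main() {
    // Sampling an integer uniformly over compress(low)..=compress(high) and
    // decompressing it yields a uniform char that is never a surrogate.
    assert_eq!(compress('\u{E000}') - compress('\u{D7FF}'), 1); // gap removed
    assert_eq!(decompress(compress('\u{10FFFF}')), '\u{10FFFF}');
}

`UniformChar` in the patch does the same thing, but stores the compressed bounds in a `UniformInt<u32>` and uses the unchecked conversion because its inputs are valid `char`s by construction.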
+#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformChar { + sampler: UniformInt, +} + +/// UTF-16 surrogate range start +const CHAR_SURROGATE_START: u32 = 0xD800; +/// UTF-16 surrogate range size +const CHAR_SURROGATE_LEN: u32 = 0xE000 - CHAR_SURROGATE_START; + +/// Convert `char` to compressed `u32` +fn char_to_comp_u32(c: char) -> u32 { + match c as u32 { + c if c >= CHAR_SURROGATE_START => c - CHAR_SURROGATE_LEN, + c => c, + } +} + +impl UniformSampler for UniformChar { + type X = char; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = char_to_comp_u32(*low_b.borrow()); + let high = char_to_comp_u32(*high_b.borrow()); + let sampler = UniformInt::::new(low, high); + UniformChar { sampler } + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = char_to_comp_u32(*low_b.borrow()); + let high = char_to_comp_u32(*high_b.borrow()); + let sampler = UniformInt::::new_inclusive(low, high); + UniformChar { sampler } + } + + fn sample(&self, rng: &mut R) -> Self::X { + let mut x = self.sampler.sample(rng); + if x >= CHAR_SURROGATE_START { + x += CHAR_SURROGATE_LEN; + } + // SAFETY: x must not be in surrogate range or greater than char::MAX. + // This relies on range constructors which accept char arguments. + // Validity of input char values is assumed. + unsafe { core::char::from_u32_unchecked(x) } + } +} + + +/// The back-end implementing [`UniformSampler`] for `Duration`. +/// +/// Unless you are implementing [`UniformSampler`] for your own types, this type +/// should not be used directly, use [`Uniform`] instead. 
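[Note on the sampler below] `UniformDuration` picks one of three strategies based on the size of the requested range: both bounds in the same whole second, total nanoseconds fitting in a `u64`, or a rejection loop over seconds. A brief usage sketch against the crate's public API (assuming the rand 0.8-style `Uniform`/`Distribution` interface used elsewhere in this module and `thread_rng` from the default `std` feature); the comments about which internal mode each range hits mirror the selection logic in the code that follows:

use core::time::Duration;
use rand::distributions::{Distribution, Uniform};

fn main() {
    let mut rng = rand::thread_rng();

    // Both bounds fall in the same whole second: only nanoseconds are sampled.
    let small = Uniform::new(Duration::from_millis(10), Duration::from_millis(20));
    // The span in nanoseconds fits in a u64: one integer draw is split into
    // seconds and nanoseconds.
    let medium = Uniform::new(Duration::from_secs(1), Duration::from_secs(3600));

    for _ in 0..100 {
        let d = small.sample(&mut rng);
        assert!(Duration::from_millis(10) <= d && d < Duration::from_millis(20));
        let d = medium.sample(&mut rng);
        assert!(Duration::from_secs(1) <= d && d < Duration::from_secs(3600));
    }
}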
+#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformDuration { + mode: UniformDurationMode, + offset: u32, +} + +#[derive(Debug, Copy, Clone, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +enum UniformDurationMode { + Small { + secs: u64, + nanos: Uniform, + }, + Medium { + nanos: Uniform, + }, + Large { + max_secs: u64, + max_nanos: u32, + secs: Uniform, + }, +} + +impl SampleUniform for Duration { + type Sampler = UniformDuration; +} + +impl UniformSampler for UniformDuration { + type X = Duration; + + #[inline] + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low < high, "Uniform::new called with `low >= high`"); + UniformDuration::new_inclusive(low, high - Duration::new(0, 1)) + } + + #[inline] + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "Uniform::new_inclusive called with `low > high`" + ); + + let low_s = low.as_secs(); + let low_n = low.subsec_nanos(); + let mut high_s = high.as_secs(); + let mut high_n = high.subsec_nanos(); + + if high_n < low_n { + high_s -= 1; + high_n += 1_000_000_000; + } + + let mode = if low_s == high_s { + UniformDurationMode::Small { + secs: low_s, + nanos: Uniform::new_inclusive(low_n, high_n), + } + } else { + let max = high_s + .checked_mul(1_000_000_000) + .and_then(|n| n.checked_add(u64::from(high_n))); + + if let Some(higher_bound) = max { + let lower_bound = low_s * 1_000_000_000 + u64::from(low_n); + UniformDurationMode::Medium { + nanos: Uniform::new_inclusive(lower_bound, higher_bound), + } + } else { + // An offset is applied to simplify generation of nanoseconds + let max_nanos = high_n - low_n; + UniformDurationMode::Large { + max_secs: high_s, + max_nanos, + secs: Uniform::new_inclusive(low_s, high_s), + } + } + }; + UniformDuration { + mode, + offset: low_n, + } + } + + #[inline] + fn sample(&self, rng: &mut R) -> Duration { + match self.mode { + UniformDurationMode::Small { secs, nanos } => { + let n = nanos.sample(rng); + Duration::new(secs, n) + } + UniformDurationMode::Medium { nanos } => { + let nanos = nanos.sample(rng); + Duration::new(nanos / 1_000_000_000, (nanos % 1_000_000_000) as u32) + } + UniformDurationMode::Large { + max_secs, + max_nanos, + secs, + } => { + // constant folding means this is at least as fast as `Rng::sample(Range)` + let nano_range = Uniform::new(0, 1_000_000_000); + loop { + let s = secs.sample(rng); + let n = nano_range.sample(rng); + if !(s == max_secs && n > max_nanos) { + let sum = n + self.offset; + break Duration::new(s, sum); + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distributions::uniform::tests::test_samples; + + #[test] + #[cfg(feature = "serde1")] + fn test_serialization_uniform_duration() { + let distr = UniformDuration::new(Duration::from_secs(10), Duration::from_secs(60)); + let de_distr: UniformDuration = + bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); + assert_eq!(distr.offset, de_distr.offset); + match (distr.mode, de_distr.mode) { + ( + UniformDurationMode::Small { + secs: a_secs, + nanos: a_nanos, + }, + UniformDurationMode::Small { secs, nanos }, + ) => { + assert_eq!(a_secs, secs); + assert_eq!(a_nanos, nanos); + } + ( + 
UniformDurationMode::Medium { nanos: a_nanos }, + UniformDurationMode::Medium { nanos }, + ) => { + assert_eq!(a_nanos, nanos); + } + ( + UniformDurationMode::Large { + max_secs: a_max_secs, + max_nanos: a_max_nanos, + secs: a_secs, + }, + UniformDurationMode::Large { + max_secs, + max_nanos, + secs, + }, + ) => { + assert_eq!(a_max_secs, max_secs); + assert_eq!(a_max_nanos, max_nanos); + assert_eq!(a_secs, secs); + } + _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly"), + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_char() { + let mut rng = crate::test::rng(891); + let mut max = core::char::from_u32(0).unwrap(); + for _ in 0..100 { + let c = rng.gen_range('A'..='Z'); + assert!(('A'..='Z').contains(&c)); + max = max.max(c); + } + assert_eq!(max, 'Z'); + let d = Uniform::new( + core::char::from_u32(0xD7F0).unwrap(), + core::char::from_u32(0xE010).unwrap(), + ); + for _ in 0..100 { + let c = d.sample(&mut rng); + assert!((c as u32) < 0xD800 || (c as u32) > 0xDFFF); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_durations() { + let mut rng = crate::test::rng(253); + + let v = &[ + (Duration::new(10, 50000), Duration::new(100, 1234)), + (Duration::new(0, 100), Duration::new(1, 50)), + ( + Duration::new(0, 0), + Duration::new(u64::max_value(), 999_999_999), + ), + ]; + for &(low, high) in v.iter() { + let my_uniform = Uniform::new(low, high); + for _ in 0..1000 { + let v = rng.sample(my_uniform); + assert!(low <= v && v < high); + } + } + } + + #[test] + fn value_stability() { + test_samples( + Duration::new(2, 0), + Duration::new(4, 0), + &[ + Duration::new(2, 532615131), + Duration::new(3, 638826742), + Duration::new(3, 485707508), + ], + &[ + Duration::new(3, 117337521), + Duration::new(3, 191764285), + Duration::new(3, 236507617), + ], + ); + + test_samples('a', 'z', &['a', 'g', 'y'], &['u', 'j', 's']); + } + + #[test] + fn uniform_distributions_can_be_compared() { + assert_eq!(Uniform::new('a', 'g'), Uniform::new('a', 'g')); + assert_eq!( + Uniform::new(Duration::from_millis(10), Duration::from_millis(20)), + Uniform::new(Duration::from_millis(10), Duration::from_millis(20)), + ); + } +} From 255ff71cee94c81b313d2fe4cd49fed2eef10862 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 24 Feb 2022 11:40:14 +0000 Subject: [PATCH 03/17] Move UniformFloat to a new submodule --- src/distributions/uniform.rs | 434 +------------------ src/distributions/uniform/uniform_float.rs | 480 +++++++++++++++++++++ 2 files changed, 483 insertions(+), 431 deletions(-) create mode 100644 src/distributions/uniform/uniform_float.rs diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 07a4ddbbf2..7e1bdcb2c8 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -108,8 +108,7 @@ use core::ops::{Range, RangeInclusive}; -use crate::distributions::float::IntoFloat; -use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; +use crate::distributions::utils::WideningMultiply; use crate::distributions::Distribution; use crate::{Rng, RngCore}; @@ -121,8 +120,10 @@ use crate::distributions::utils::Float; #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; +mod uniform_float; mod uniform_other; +pub use uniform_float::UniformFloat; pub use uniform_other::{UniformChar, UniformDuration}; /// Sample values uniformly between two bounds. @@ -724,241 +725,10 @@ uniform_simd_int_impl! 
{ u8 } -/// The back-end implementing [`UniformSampler`] for floating-point types. -/// -/// Unless you are implementing [`UniformSampler`] for your own type, this type -/// should not be used directly, use [`Uniform`] instead. -/// -/// # Implementation notes -/// -/// Instead of generating a float in the `[0, 1)` range using [`Standard`], the -/// `UniformFloat` implementation converts the output of an PRNG itself. This -/// way one or two steps can be optimized out. -/// -/// The floats are first converted to a value in the `[1, 2)` interval using a -/// transmute-based method, and then mapped to the expected range with a -/// multiply and addition. Values produced this way have what equals 23 bits of -/// random digits for an `f32`, and 52 for an `f64`. -/// -/// [`new`]: UniformSampler::new -/// [`new_inclusive`]: UniformSampler::new_inclusive -/// [`Standard`]: crate::distributions::Standard -#[derive(Clone, Copy, Debug, PartialEq)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformFloat { - low: X, - scale: X, -} - -macro_rules! uniform_float_impl { - ($ty:ty, $uty:ident, $f_scalar:ident, $u_scalar:ident, $bits_to_discard:expr) => { - impl SampleUniform for $ty { - type Sampler = UniformFloat<$ty>; - } - - impl UniformSampler for UniformFloat<$ty> { - type X = $ty; - - fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - debug_assert!( - low.all_finite(), - "Uniform::new called with `low` non-finite." - ); - debug_assert!( - high.all_finite(), - "Uniform::new called with `high` non-finite." - ); - assert!(low.all_lt(high), "Uniform::new called with `low >= high`"); - let max_rand = <$ty>::splat( - (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, - ); - - let mut scale = high - low; - assert!(scale.all_finite(), "Uniform::new: range overflow"); - - loop { - let mask = (scale * max_rand + low).ge_mask(high); - if mask.none() { - break; - } - scale = scale.decrease_masked(mask); - } - - debug_assert!(<$ty>::splat(0.0).all_le(scale)); - - UniformFloat { low, scale } - } - - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - debug_assert!( - low.all_finite(), - "Uniform::new_inclusive called with `low` non-finite." - ); - debug_assert!( - high.all_finite(), - "Uniform::new_inclusive called with `high` non-finite." - ); - assert!( - low.all_le(high), - "Uniform::new_inclusive called with `low > high`" - ); - let max_rand = <$ty>::splat( - (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, - ); - - let mut scale = (high - low) / max_rand; - assert!(scale.all_finite(), "Uniform::new_inclusive: range overflow"); - - loop { - let mask = (scale * max_rand + low).gt_mask(high); - if mask.none() { - break; - } - scale = scale.decrease_masked(mask); - } - - debug_assert!(<$ty>::splat(0.0).all_le(scale)); - - UniformFloat { low, scale } - } - - fn sample(&self, rng: &mut R) -> Self::X { - // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); - - // Get a value in the range [0, 1) in order to avoid - // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; - - // We don't use `f64::mul_add`, because it is not available with - // `no_std`. 
Furthermore, it is slower for some targets (but - // faster for others). However, the order of multiplication and - // addition is important, because on some platforms (e.g. ARM) - // it will be optimized to a single (non-FMA) instruction. - value0_1 * self.scale + self.low - } - - #[inline] - fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - debug_assert!( - low.all_finite(), - "UniformSampler::sample_single called with `low` non-finite." - ); - debug_assert!( - high.all_finite(), - "UniformSampler::sample_single called with `high` non-finite." - ); - assert!( - low.all_lt(high), - "UniformSampler::sample_single: low >= high" - ); - let mut scale = high - low; - assert!( - scale.all_finite(), - "UniformSampler::sample_single: range overflow" - ); - - loop { - // Generate a value in the range [1, 2) - let value1_2 = - (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); - - // Get a value in the range [0, 1) in order to avoid - // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; - - // Doing multiply before addition allows some architectures - // to use a single instruction. - let res = value0_1 * scale + low; - - debug_assert!(low.all_le(res) || !scale.all_finite()); - if res.all_lt(high) { - return res; - } - - // This handles a number of edge cases. - // * `low` or `high` is NaN. In this case `scale` and - // `res` are going to end up as NaN. - // * `low` is negative infinity and `high` is finite. - // `scale` is going to be infinite and `res` will be - // NaN. - // * `high` is positive infinity and `low` is finite. - // `scale` is going to be infinite and `res` will - // be infinite or NaN (if value0_1 is 0). - // * `low` is negative infinity and `high` is positive - // infinity. `scale` will be infinite and `res` will - // be NaN. - // * `low` and `high` are finite, but `high - low` - // overflows to infinite. `scale` will be infinite - // and `res` will be infinite or NaN (if value0_1 is 0). - // So if `high` or `low` are non-finite, we are guaranteed - // to fail the `res < high` check above and end up here. - // - // While we technically should check for non-finite `low` - // and `high` before entering the loop, by doing the checks - // here instead, we allow the common case to avoid these - // checks. But we are still guaranteed that if `low` or - // `high` are non-finite we'll end up here and can do the - // appropriate checks. - // - // Likewise `high - low` overflowing to infinity is also - // rare, so handle it here after the common case. - let mask = !scale.finite_mask(); - if mask.any() { - assert!( - low.all_finite() && high.all_finite(), - "Uniform::sample_single: low and high must be finite" - ); - scale = scale.decrease_masked(mask); - } - } - } - } - }; -} - -uniform_float_impl! { f32, u32, f32, u32, 32 - 23 } -uniform_float_impl! { f64, u64, f64, u64, 64 - 52 } - -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x2, u32x2, f32, u32, 32 - 23 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x4, u32x4, f32, u32, 32 - 23 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x8, u32x8, f32, u32, 32 - 23 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x16, u32x16, f32, u32, 32 - 23 } - -#[cfg(feature = "simd_support")] -uniform_float_impl! { f64x2, u64x2, f64, u64, 64 - 52 } -#[cfg(feature = "simd_support")] -uniform_float_impl! 
{ f64x4, u64x4, f64, u64, 64 - 52 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 } - #[cfg(test)] mod tests { use super::*; - use crate::rngs::mock::StepRng; #[test] #[cfg(feature = "serde1")] @@ -970,13 +740,6 @@ mod tests { assert_eq!(unit_box.0.low, de_unit_box.0.low); assert_eq!(unit_box.0.range, de_unit_box.0.range); assert_eq!(unit_box.0.z, de_unit_box.0.z); - - let unit_box: Uniform = Uniform::new(-1., 1.); - let de_unit_box: Uniform = - bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); - - assert_eq!(unit_box.0.low, de_unit_box.0.low); - assert_eq!(unit_box.0.scale, de_unit_box.0.scale); } #[should_panic] @@ -1086,188 +849,6 @@ mod tests { } } - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_floats() { - let mut rng = crate::test::rng(252); - let mut zero_rng = StepRng::new(0, 0); - let mut max_rng = StepRng::new(0xffff_ffff_ffff_ffff, 0); - macro_rules! t { - ($ty:ty, $f_scalar:ident, $bits_shifted:expr) => {{ - let v: &[($f_scalar, $f_scalar)] = &[ - (0.0, 100.0), - (-1e35, -1e25), - (1e-35, 1e-25), - (-1e35, 1e35), - (<$f_scalar>::from_bits(0), <$f_scalar>::from_bits(3)), - (-<$f_scalar>::from_bits(10), -<$f_scalar>::from_bits(1)), - (-<$f_scalar>::from_bits(5), 0.0), - (-<$f_scalar>::from_bits(7), -0.0), - (0.1 * ::core::$f_scalar::MAX, ::core::$f_scalar::MAX), - (-::core::$f_scalar::MAX * 0.2, ::core::$f_scalar::MAX * 0.7), - ]; - for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { - let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); - let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); - let my_uniform = Uniform::new(low, high); - let my_incl_uniform = Uniform::new_inclusive(low, high); - for _ in 0..100 { - let v = rng.sample(my_uniform).extract(lane); - assert!(low_scalar <= v && v < high_scalar); - let v = rng.sample(my_incl_uniform).extract(lane); - assert!(low_scalar <= v && v <= high_scalar); - let v = - <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng) - .extract(lane); - assert!(low_scalar <= v && v < high_scalar); - } - - assert_eq!( - rng.sample(Uniform::new_inclusive(low, low)).extract(lane), - low_scalar - ); - - assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar); - assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar); - assert_eq!( - <$ty as SampleUniform>::Sampler::sample_single( - low, - high, - &mut zero_rng - ) - .extract(lane), - low_scalar - ); - assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar); - assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar); - - // Don't run this test for really tiny differences between high and low - // since for those rounding might result in selecting high for a very - // long time. 
- if (high_scalar - low_scalar) > 0.0001 { - let mut lowering_max_rng = StepRng::new( - 0xffff_ffff_ffff_ffff, - (-1i64 << $bits_shifted) as u64, - ); - assert!( - <$ty as SampleUniform>::Sampler::sample_single( - low, - high, - &mut lowering_max_rng - ) - .extract(lane) - < high_scalar - ); - } - } - } - - assert_eq!( - rng.sample(Uniform::new_inclusive( - ::core::$f_scalar::MAX, - ::core::$f_scalar::MAX - )), - ::core::$f_scalar::MAX - ); - assert_eq!( - rng.sample(Uniform::new_inclusive( - -::core::$f_scalar::MAX, - -::core::$f_scalar::MAX - )), - -::core::$f_scalar::MAX - ); - }}; - } - - t!(f32, f32, 32 - 23); - t!(f64, f64, 64 - 52); - #[cfg(feature = "simd_support")] - { - t!(f32x2, f32, 32 - 23); - t!(f32x4, f32, 32 - 23); - t!(f32x8, f32, 32 - 23); - t!(f32x16, f32, 32 - 23); - t!(f64x2, f64, 64 - 52); - t!(f64x4, f64, 64 - 52); - t!(f64x8, f64, 64 - 52); - } - } - - #[test] - #[should_panic] - fn test_float_overflow() { - let _ = Uniform::from(::core::f64::MIN..::core::f64::MAX); - } - - #[test] - #[should_panic] - fn test_float_overflow_single() { - let mut rng = crate::test::rng(252); - rng.gen_range(::core::f64::MIN..::core::f64::MAX); - } - - #[test] - #[cfg(all( - feature = "std", - not(target_arch = "wasm32"), - not(target_arch = "asmjs") - ))] - fn test_float_assertions() { - use super::SampleUniform; - use std::panic::catch_unwind; - fn range(low: T, high: T) { - let mut rng = crate::test::rng(253); - T::Sampler::sample_single(low, high, &mut rng); - } - - macro_rules! t { - ($ty:ident, $f_scalar:ident) => {{ - let v: &[($f_scalar, $f_scalar)] = &[ - (::std::$f_scalar::NAN, 0.0), - (1.0, ::std::$f_scalar::NAN), - (::std::$f_scalar::NAN, ::std::$f_scalar::NAN), - (1.0, 0.5), - (::std::$f_scalar::MAX, -::std::$f_scalar::MAX), - (::std::$f_scalar::INFINITY, ::std::$f_scalar::INFINITY), - ( - ::std::$f_scalar::NEG_INFINITY, - ::std::$f_scalar::NEG_INFINITY, - ), - (::std::$f_scalar::NEG_INFINITY, 5.0), - (5.0, ::std::$f_scalar::INFINITY), - (::std::$f_scalar::NAN, ::std::$f_scalar::INFINITY), - (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::NAN), - (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY), - ]; - for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { - let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); - let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); - assert!(catch_unwind(|| range(low, high)).is_err()); - assert!(catch_unwind(|| Uniform::new(low, high)).is_err()); - assert!(catch_unwind(|| Uniform::new_inclusive(low, high)).is_err()); - assert!(catch_unwind(|| range(low, low)).is_err()); - assert!(catch_unwind(|| Uniform::new(low, low)).is_err()); - } - } - }}; - } - - t!(f32, f32); - t!(f64, f64); - #[cfg(feature = "simd_support")] - { - t!(f32x2, f32); - t!(f32x4, f32); - t!(f32x8, f32); - t!(f32x16, f32); - t!(f64x2, f64); - t!(f64x4, f64); - t!(f64x8, f64); - } - } - #[test] fn test_custom_uniform() { use crate::distributions::uniform::{ @@ -1322,9 +903,6 @@ mod tests { let r = Uniform::from(2u32..7); assert_eq!(r.0.low, 2); assert_eq!(r.0.range, 5); - let r = Uniform::from(2.0f64..7.0); - assert_eq!(r.0.low, 2.0); - assert_eq!(r.0.scale, 5.0); } #[test] @@ -1332,10 +910,6 @@ mod tests { let r = Uniform::from(2u32..=6); assert_eq!(r.0.low, 2); assert_eq!(r.0.range, 5); - let r = Uniform::from(2.0f64..=7.0); - assert_eq!(r.0.low, 2.0); - assert!(r.0.scale > 5.0); - assert!(r.0.scale < 5.0 + 1e-14); } pub(crate) fn test_samples( @@ -1379,8 +953,6 @@ mod tests { #[test] fn 
uniform_distributions_can_be_compared() { - assert_eq!(Uniform::new(1.0, 2.0), Uniform::new(1.0, 2.0)); - assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); } } diff --git a/src/distributions/uniform/uniform_float.rs b/src/distributions/uniform/uniform_float.rs new file mode 100644 index 0000000000..cf4e37e067 --- /dev/null +++ b/src/distributions/uniform/uniform_float.rs @@ -0,0 +1,480 @@ +// Copyright 2018-2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{SampleBorrow, SampleUniform, UniformSampler}; +use crate::distributions::float::IntoFloat; +use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils}; +use crate::Rng; +#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; + +/// The back-end implementing [`UniformSampler`] for floating-point types. +/// +/// Unless you are implementing [`UniformSampler`] for your own type, this type +/// should not be used directly, use [`Uniform`] instead. +/// +/// # Implementation notes +/// +/// Instead of generating a float in the `[0, 1)` range using [`Standard`], the +/// `UniformFloat` implementation converts the output of an PRNG itself. This +/// way one or two steps can be optimized out. +/// +/// The floats are first converted to a value in the `[1, 2)` interval using a +/// transmute-based method, and then mapped to the expected range with a +/// multiply and addition. Values produced this way have what equals 23 bits of +/// random digits for an `f32`, and 52 for an `f64`. +/// +/// [`new`]: UniformSampler::new +/// [`new_inclusive`]: UniformSampler::new_inclusive +/// [`Standard`]: crate::distributions::Standard +#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformFloat { + low: X, + scale: X, +} + +macro_rules! uniform_float_impl { + ($ty:ty, $uty:ident, $f_scalar:ident, $u_scalar:ident, $bits_to_discard:expr) => { + impl SampleUniform for $ty { + type Sampler = UniformFloat<$ty>; + } + + impl UniformSampler for UniformFloat<$ty> { + type X = $ty; + + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + debug_assert!( + low.all_finite(), + "Uniform::new called with `low` non-finite." + ); + debug_assert!( + high.all_finite(), + "Uniform::new called with `high` non-finite." + ); + assert!(low.all_lt(high), "Uniform::new called with `low >= high`"); + let max_rand = <$ty>::splat( + (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, + ); + + let mut scale = high - low; + assert!(scale.all_finite(), "Uniform::new: range overflow"); + + loop { + let mask = (scale * max_rand + low).ge_mask(high); + if mask.none() { + break; + } + scale = scale.decrease_masked(mask); + } + + debug_assert!(<$ty>::splat(0.0).all_le(scale)); + + UniformFloat { low, scale } + } + + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + debug_assert!( + low.all_finite(), + "Uniform::new_inclusive called with `low` non-finite." + ); + debug_assert!( + high.all_finite(), + "Uniform::new_inclusive called with `high` non-finite." 
+ ); + assert!( + low.all_le(high), + "Uniform::new_inclusive called with `low > high`" + ); + let max_rand = <$ty>::splat( + (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, + ); + + let mut scale = (high - low) / max_rand; + assert!(scale.all_finite(), "Uniform::new_inclusive: range overflow"); + + loop { + let mask = (scale * max_rand + low).gt_mask(high); + if mask.none() { + break; + } + scale = scale.decrease_masked(mask); + } + + debug_assert!(<$ty>::splat(0.0).all_le(scale)); + + UniformFloat { low, scale } + } + + fn sample(&self, rng: &mut R) -> Self::X { + // Generate a value in the range [1, 2) + let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + + // Get a value in the range [0, 1) in order to avoid + // overflowing into infinity when multiplying with scale + let value0_1 = value1_2 - 1.0; + + // We don't use `f64::mul_add`, because it is not available with + // `no_std`. Furthermore, it is slower for some targets (but + // faster for others). However, the order of multiplication and + // addition is important, because on some platforms (e.g. ARM) + // it will be optimized to a single (non-FMA) instruction. + value0_1 * self.scale + self.low + } + + #[inline] + fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + debug_assert!( + low.all_finite(), + "UniformSampler::sample_single called with `low` non-finite." + ); + debug_assert!( + high.all_finite(), + "UniformSampler::sample_single called with `high` non-finite." + ); + assert!( + low.all_lt(high), + "UniformSampler::sample_single: low >= high" + ); + let mut scale = high - low; + assert!( + scale.all_finite(), + "UniformSampler::sample_single: range overflow" + ); + + loop { + // Generate a value in the range [1, 2) + let value1_2 = + (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + + // Get a value in the range [0, 1) in order to avoid + // overflowing into infinity when multiplying with scale + let value0_1 = value1_2 - 1.0; + + // Doing multiply before addition allows some architectures + // to use a single instruction. + let res = value0_1 * scale + low; + + debug_assert!(low.all_le(res) || !scale.all_finite()); + if res.all_lt(high) { + return res; + } + + // This handles a number of edge cases. + // * `low` or `high` is NaN. In this case `scale` and + // `res` are going to end up as NaN. + // * `low` is negative infinity and `high` is finite. + // `scale` is going to be infinite and `res` will be + // NaN. + // * `high` is positive infinity and `low` is finite. + // `scale` is going to be infinite and `res` will + // be infinite or NaN (if value0_1 is 0). + // * `low` is negative infinity and `high` is positive + // infinity. `scale` will be infinite and `res` will + // be NaN. + // * `low` and `high` are finite, but `high - low` + // overflows to infinite. `scale` will be infinite + // and `res` will be infinite or NaN (if value0_1 is 0). + // So if `high` or `low` are non-finite, we are guaranteed + // to fail the `res < high` check above and end up here. + // + // While we technically should check for non-finite `low` + // and `high` before entering the loop, by doing the checks + // here instead, we allow the common case to avoid these + // checks. But we are still guaranteed that if `low` or + // `high` are non-finite we'll end up here and can do the + // appropriate checks. 
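[Note on the technique above] The implementation notes at the top of this new module, and the comments in `sample`/`sample_single`, describe the core trick: fill the mantissa of a float in the `[1, 2)` interval directly from PRNG bits, subtract 1, then map with `value * scale + low`. A minimal standalone sketch of that bit manipulation for `f32`, using a hard-coded bit pattern instead of the crate's `IntoFloat` helper (the function name and test values here are illustrative):

// 0x3F80_0000 is the bit pattern of 1.0f32 (sign 0, exponent 127, mantissa 0);
// OR-ing 23 random bits into the mantissa field yields a float in [1, 2).
fn u32_to_unit_f32(random: u32) -> f32 {
    let value1_2 = f32::from_bits(0x3F80_0000 | (random >> (32 - 23)));
    value1_2 - 1.0
}

fn main() {
    assert_eq!(u32_to_unit_f32(0), 0.0);
    assert!(u32_to_unit_f32(u32::MAX) < 1.0); // largest output is 1 - 2^-23

    // Mapping to [low, high) is then multiply-before-add, as in the sampler.
    let (low, scale) = (10.0f32, 5.0f32);
    let x = u32_to_unit_f32(0x8000_0000) * scale + low;
    assert_eq!(x, 12.5);
}

For arbitrary bounds, rounding of `value * scale + low` can still reach `high`, which is why `new`/`new_inclusive` shrink `scale` in a loop and `sample_single` rejects and retries, as the code above shows.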
+ // + // Likewise `high - low` overflowing to infinity is also + // rare, so handle it here after the common case. + let mask = !scale.finite_mask(); + if mask.any() { + assert!( + low.all_finite() && high.all_finite(), + "Uniform::sample_single: low and high must be finite" + ); + scale = scale.decrease_masked(mask); + } + } + } + } + }; +} + +uniform_float_impl! { f32, u32, f32, u32, 32 - 23 } +uniform_float_impl! { f64, u64, f64, u64, 64 - 52 } + +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x2, u32x2, f32, u32, 32 - 23 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x4, u32x4, f32, u32, 32 - 23 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x8, u32x8, f32, u32, 32 - 23 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x16, u32x16, f32, u32, 32 - 23 } + +#[cfg(feature = "simd_support")] +uniform_float_impl! { f64x2, u64x2, f64, u64, 64 - 52 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f64x4, u64x4, f64, u64, 64 - 52 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 } + +#[cfg(test)] +mod tests { + use super::*; + use crate::distributions::uniform::tests::test_samples; + use crate::distributions::Uniform; + use crate::rngs::mock::StepRng; + + #[test] + #[cfg(feature = "serde1")] + fn test_uniform_serialization() { + let unit_box: Uniform = Uniform::new(-1., 1.); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + + assert_eq!(unit_box.0.low, de_unit_box.0.low); + assert_eq!(unit_box.0.scale, de_unit_box.0.scale); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_floats() { + let mut rng = crate::test::rng(252); + let mut zero_rng = StepRng::new(0, 0); + let mut max_rng = StepRng::new(0xffff_ffff_ffff_ffff, 0); + macro_rules! 
t { + ($ty:ty, $f_scalar:ident, $bits_shifted:expr) => {{ + let v: &[($f_scalar, $f_scalar)] = &[ + (0.0, 100.0), + (-1e35, -1e25), + (1e-35, 1e-25), + (-1e35, 1e35), + (<$f_scalar>::from_bits(0), <$f_scalar>::from_bits(3)), + (-<$f_scalar>::from_bits(10), -<$f_scalar>::from_bits(1)), + (-<$f_scalar>::from_bits(5), 0.0), + (-<$f_scalar>::from_bits(7), -0.0), + (0.1 * ::core::$f_scalar::MAX, ::core::$f_scalar::MAX), + (-::core::$f_scalar::MAX * 0.2, ::core::$f_scalar::MAX * 0.7), + ]; + for &(low_scalar, high_scalar) in v.iter() { + for lane in 0..<$ty>::lanes() { + let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); + let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); + let my_uniform = Uniform::new(low, high); + let my_incl_uniform = Uniform::new_inclusive(low, high); + for _ in 0..100 { + let v = rng.sample(my_uniform).extract(lane); + assert!(low_scalar <= v && v < high_scalar); + let v = rng.sample(my_incl_uniform).extract(lane); + assert!(low_scalar <= v && v <= high_scalar); + let v = + <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng) + .extract(lane); + assert!(low_scalar <= v && v < high_scalar); + } + + assert_eq!( + rng.sample(Uniform::new_inclusive(low, low)).extract(lane), + low_scalar + ); + + assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar); + assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar); + assert_eq!( + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut zero_rng + ) + .extract(lane), + low_scalar + ); + assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar); + assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar); + + // Don't run this test for really tiny differences between high and low + // since for those rounding might result in selecting high for a very + // long time. + if (high_scalar - low_scalar) > 0.0001 { + let mut lowering_max_rng = StepRng::new( + 0xffff_ffff_ffff_ffff, + (-1i64 << $bits_shifted) as u64, + ); + assert!( + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut lowering_max_rng + ) + .extract(lane) + < high_scalar + ); + } + } + } + + assert_eq!( + rng.sample(Uniform::new_inclusive( + ::core::$f_scalar::MAX, + ::core::$f_scalar::MAX + )), + ::core::$f_scalar::MAX + ); + assert_eq!( + rng.sample(Uniform::new_inclusive( + -::core::$f_scalar::MAX, + -::core::$f_scalar::MAX + )), + -::core::$f_scalar::MAX + ); + }}; + } + + t!(f32, f32, 32 - 23); + t!(f64, f64, 64 - 52); + #[cfg(feature = "simd_support")] + { + t!(f32x2, f32, 32 - 23); + t!(f32x4, f32, 32 - 23); + t!(f32x8, f32, 32 - 23); + t!(f32x16, f32, 32 - 23); + t!(f64x2, f64, 64 - 52); + t!(f64x4, f64, 64 - 52); + t!(f64x8, f64, 64 - 52); + } + } + + #[test] + #[should_panic] + fn test_float_overflow() { + let _ = Uniform::from(::core::f64::MIN..::core::f64::MAX); + } + + #[test] + #[should_panic] + fn test_float_overflow_single() { + let mut rng = crate::test::rng(252); + rng.gen_range(::core::f64::MIN..::core::f64::MAX); + } + + #[test] + #[cfg(all( + feature = "std", + not(target_arch = "wasm32"), + not(target_arch = "asmjs") + ))] + fn test_float_assertions() { + use super::SampleUniform; + use std::panic::catch_unwind; + fn range(low: T, high: T) { + let mut rng = crate::test::rng(253); + T::Sampler::sample_single(low, high, &mut rng); + } + + macro_rules! 
t { + ($ty:ident, $f_scalar:ident) => {{ + let v: &[($f_scalar, $f_scalar)] = &[ + (::std::$f_scalar::NAN, 0.0), + (1.0, ::std::$f_scalar::NAN), + (::std::$f_scalar::NAN, ::std::$f_scalar::NAN), + (1.0, 0.5), + (::std::$f_scalar::MAX, -::std::$f_scalar::MAX), + (::std::$f_scalar::INFINITY, ::std::$f_scalar::INFINITY), + ( + ::std::$f_scalar::NEG_INFINITY, + ::std::$f_scalar::NEG_INFINITY, + ), + (::std::$f_scalar::NEG_INFINITY, 5.0), + (5.0, ::std::$f_scalar::INFINITY), + (::std::$f_scalar::NAN, ::std::$f_scalar::INFINITY), + (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::NAN), + (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY), + ]; + for &(low_scalar, high_scalar) in v.iter() { + for lane in 0..<$ty>::lanes() { + let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); + let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); + assert!(catch_unwind(|| range(low, high)).is_err()); + assert!(catch_unwind(|| Uniform::new(low, high)).is_err()); + assert!(catch_unwind(|| Uniform::new_inclusive(low, high)).is_err()); + assert!(catch_unwind(|| range(low, low)).is_err()); + assert!(catch_unwind(|| Uniform::new(low, low)).is_err()); + } + } + }}; + } + + t!(f32, f32); + t!(f64, f64); + #[cfg(feature = "simd_support")] + { + t!(f32x2, f32); + t!(f32x4, f32); + t!(f32x8, f32); + t!(f32x16, f32); + t!(f64x2, f64); + t!(f64x4, f64); + t!(f64x8, f64); + } + } + + #[test] + fn test_uniform_from_std_range() { + let r = Uniform::from(2.0f64..7.0); + assert_eq!(r.0.low, 2.0); + assert_eq!(r.0.scale, 5.0); + } + + #[test] + fn test_uniform_from_std_range_inclusive() { + let r = Uniform::from(2.0f64..=7.0); + assert_eq!(r.0.low, 2.0); + assert!(r.0.scale > 5.0); + assert!(r.0.scale < 5.0 + 1e-14); + } + + #[test] + fn value_stability() { + test_samples(0f32, 1000000f32, &[30701.041, 266307.47, 979833.0], &[ + 819413.3, 398172.03, 742853.6, + ]); + test_samples( + -1f64, + 1f64, + &[-0.4673848682871551, 0.6388267422932352, 0.4857075081198343], + &[0.11733752121808161, 0.191764285210958, 0.2365076174315397], + ); + + // TODO: SIMD types + } + + #[test] + fn uniform_distributions_can_be_compared() { + assert_eq!(Uniform::new(1.0, 2.0), Uniform::new(1.0, 2.0)); + } +} From 00869d7c3d223d1f575e7f4d64b44dca9cca92ef Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 10:13:52 +0100 Subject: [PATCH 04/17] Impl ONeill, Canon, Canon-Lemire and Bitmask methods for integer types This is based on @TheIronBorn's work (#1154, #1172), with some changes. --- src/distributions/uniform.rs | 198 ++++++++++++++++++++++++++++++++--- 1 file changed, 182 insertions(+), 16 deletions(-) diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 7e1bdcb2c8..1029a3f888 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -433,7 +433,7 @@ pub struct UniformInt { } macro_rules! uniform_int_impl { - ($ty:ty, $unsigned:ident, $u_large:ident) => { + ($ty:ty, $unsigned:ident, $u_large:ident, $u_extra_large:ident) => { impl SampleUniform for $ty { type Sampler = UniformInt<$ty>; } @@ -536,9 +536,8 @@ macro_rules! uniform_int_impl { "UniformSampler::sample_single_inclusive: low > high" ); let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - // If the above resulted in wrap-around to 0, the range is $ty::MIN..=$ty::MAX, - // and any integer will do. if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case return rng.gen(); } @@ -564,21 +563,188 @@ macro_rules! 
uniform_int_impl { } } } + + impl UniformInt<$ty> { + /// Sample single inclusive, using ONeill's method + #[inline] + pub fn sample_single_inclusive_oneill( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // we use the "Debiased Int Mult (t-opt, m-opt)" rejection sampling method + // described here https://www.pcg-random.org/posts/bounded-rands.html + // and here https://github.com/imneme/bounded-rands + + let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); + if lo < range { + let mut threshold = range.wrapping_neg(); + // this shortcut works best with large ranges + if threshold >= range { + threshold -= range; + if threshold >= range { + threshold %= range; + } + } + while lo < threshold { + let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); + hi = new_hi; + lo = new_lo; + } + } + low.wrapping_add(hi as $ty) + } + + /// Sample single inclusive, using Canon's method + #[inline] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... + if lo_order > range.wrapping_neg() { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using Canon's method with Lemire's early-out + #[inline] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... 
(since range won't be changing we can further + // improve this check with a modulo) + if lo_order < range.wrapping_neg() % range { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using the Bitmask method + #[inline] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let mut range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // the old impl use a mix of methods for different integer sizes, we only use + // the lz method here for a better comparison. + + let mut mask = $u_large::max_value(); + range -= 1; + mask >>= (range | 1).leading_zeros(); + loop { + let x = rng.gen::<$u_large>() & mask; + if x <= range { + return low.wrapping_add(x as $ty); + } + } + } + } }; } - -uniform_int_impl! { i8, u8, u32 } -uniform_int_impl! { i16, u16, u32 } -uniform_int_impl! { i32, u32, u32 } -uniform_int_impl! { i64, u64, u64 } -uniform_int_impl! { i128, u128, u128 } -uniform_int_impl! { isize, usize, usize } -uniform_int_impl! { u8, u8, u32 } -uniform_int_impl! { u16, u16, u32 } -uniform_int_impl! { u32, u32, u32 } -uniform_int_impl! { u64, u64, u64 } -uniform_int_impl! { usize, usize, usize } -uniform_int_impl! { u128, u128, u128 } +uniform_int_impl! { i8, u8, u32, u64 } +uniform_int_impl! { i16, u16, u32, u64 } +uniform_int_impl! { i32, u32, u32, u64 } +uniform_int_impl! { i64, u64, u64, u64 } +uniform_int_impl! { i128, u128, u128, u128 } +uniform_int_impl! { u8, u8, u32, u64 } +uniform_int_impl! { u16, u16, u32, u64 } +uniform_int_impl! { u32, u32, u32, u64 } +uniform_int_impl! { u64, u64, u64, u64 } +uniform_int_impl! { u128, u128, u128, u128 } +#[cfg(any(target_pointer_width = "16", target_pointer_width = "32",))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, u64 } + uniform_int_impl! { usize, usize, usize, u64 } +} +#[cfg(not(any(target_pointer_width = "16", target_pointer_width = "32",)))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, usize } + uniform_int_impl! { usize, usize, usize, usize } +} #[cfg(feature = "simd_support")] macro_rules! 
uniform_simd_int_impl { From 8753100a6c685f8ff990cc9c414becb39f413ac5 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 11:37:50 +0100 Subject: [PATCH 05/17] Add Criterion benchmark for uniform int distributions --- Cargo.toml | 6 +++++ benches/uniform.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 benches/uniform.rs diff --git a/Cargo.toml b/Cargo.toml index 98ba373c68..cb27f4193c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,12 @@ features = ["into_bits"] libc = { version = "0.2.22", optional = true, default-features = false } [dev-dependencies] +criterion = { version = "0.3", features = ["html_reports"] } +num-traits = "0.2.14" rand_pcg = { path = "rand_pcg", version = "0.3.0" } # Only to test serde1 bincode = "1.2.1" + +[[bench]] +name = "uniform" +harness = false diff --git a/benches/uniform.rs b/benches/uniform.rs new file mode 100644 index 0000000000..5e19c046e1 --- /dev/null +++ b/benches/uniform.rs @@ -0,0 +1,67 @@ +// Copyright 2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use criterion::{criterion_group, criterion_main}; +use criterion::{BenchmarkId, Criterion}; +use rand::distributions::uniform::{SampleUniform, UniformSampler}; +use rand::prelude::*; + +type BenchRng = SmallRng; + +macro_rules! bench_int_group { + ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { + for input in $inputs { + $g.bench_with_input( + BenchmarkId::new($name, input.0), + &input.1, + |b, (low, high)| { + let mut rng = BenchRng::from_entropy(); + b.iter(|| <$T as SampleUniform>::Sampler::$f(low, high, &mut rng)) + }, + ); + } + $g.bench_function(BenchmarkId::new($name, "varying"), |b| { + let (low, mut high) = ($inputs[0].1 .0, $inputs[1].1 .1); + let mut rng = BenchRng::from_entropy(); + b.iter(|| { + high = high.wrapping_add(1); + <$T as SampleUniform>::Sampler::$f(low, high, &mut rng) + }) + }); + }; +} + +macro_rules! bench_int { + ($c:expr, $T:ty, $high:expr) => {{ + let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T))); + let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))]; + + bench_int_group!("Old", $T, sample_single_inclusive, g, inputs); + bench_int_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs); + bench_int_group!("Canon", $T, sample_single_inclusive_canon, g, inputs); + bench_int_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs); + bench_int_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs); + }}; +} + +fn uniform_int(c: &mut Criterion) { + // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size + // the exact values were determined with an exhaustive search + bench_int!(c, i8, (i8::MIN, 116)); + bench_int!(c, i16, (i16::MIN, 32407)); + bench_int!(c, i32, (i32::MIN, 1)); + bench_int!(c, i64, (i64::MIN, 1)); + bench_int!(c, i128, (i128::MIN, 1)); +} + +criterion_group! 
{ + name = benches; + config = Criterion::default(); + targets = uniform_int +} +criterion_main!(benches); From aada17da89db175e5123fdc4ede8d2a65b3eb7bb Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 12:28:30 +0100 Subject: [PATCH 06/17] Add uniform SIMD impls for new algs This is from @TheIronBorn (see #1172) --- src/distributions/integer.rs | 4 +- src/distributions/uniform.rs | 417 ++++++++++++++++++++++++++++++++++- src/distributions/utils.rs | 191 ++++++++++++++++ 3 files changed, 607 insertions(+), 5 deletions(-) diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index 19ce71599c..931c2235d7 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -148,9 +148,9 @@ simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,); #[cfg(feature = "simd_support")] simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,); #[cfg(feature = "simd_support")] -simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,); +simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,); #[cfg(feature = "simd_support")] -simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,); +simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,); #[cfg(all( feature = "simd_support", any(target_arch = "x86", target_arch = "x86_64") diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 1029a3f888..231237609f 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -109,6 +109,8 @@ use core::ops::{Range, RangeInclusive}; use crate::distributions::utils::WideningMultiply; +#[cfg(feature = "simd_support")] +use crate::distributions::utils::{OverflowingAdd, SimdCombine, SimdSplit}; use crate::distributions::Distribution; use crate::{Rng, RngCore}; @@ -853,6 +855,381 @@ macro_rules! uniform_simd_int_impl { }; } +#[cfg(feature = "simd_support")] +macro_rules! 
uniform_simd_int_gt8_impl { + ($ty:ident, $unsigned:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + let range: $unsigned = ((high - low) + 1).cast(); + (range, low) + } + + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + let mut vecs_64 = range.simd_split(); + for x in &mut vecs_64 { + *x = rng.gen::().wmul((*x).cast()).0.cast(); + } + let cast_new_hi: $unsigned = vecs_64.simd_combine(); + + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// Canon's method + #[inline(always)] + pub fn sample_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Canon's method + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Canon + Lemire's early-out + #[inline(always)] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // lo_order < range.wrapping_neg() % range { + // may panic if range == 0 + if lo_order.lt((0 - range) % range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Bitmask + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + 
const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident)),+) => {$( + uniform_simd_int_gt8_impl!{ $unsigned, $unsigned } + uniform_simd_int_gt8_impl!{ $signed, $unsigned } + )+}; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_le8_impl { + ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + // wrapping sub and add + let range: $unsigned = ((high - low) + 1).cast(); + (range, low) + } + + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + let (new_hi_order, _) = new_bits.wmul(large_range); + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_scalar( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + + // let (new_hi_order, _) = new_bits.wmul(large_range); + let mut new_hi_order = <$u_extra_large>::default(); 
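+                // The loop below is a lane-by-lane (scalar) widening multiply, equivalent
+                // to the vector `wmul` call commented out above; it is kept so the
+                // benchmarks can compare this scalar path against the vectorised one.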
+ + for i in 0..<$ty>::lanes() { + let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i)); + new_hi_order = new_hi_order.replace(i, shi); + } + + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // lo_order < range.wrapping_neg() % range { + // may panic if range == 0 + if lo_order.lt((0 - range) % range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. 
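+                    // Lanes sampling the full range return their raw random bits, which
+                    // are already uniform; all other lanes return `low + masked`.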
+ let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$( + uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large } + uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large } + )+}; +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u128x2, i128x2), + (u128x4, i128x4), + u128 +} +// #[cfg(feature = "simd_support")] +// uniform_simd_int_impl! { +// (usizex2, isizex2), +// (usizex4, isizex4), +// (usizex8, isizex8), +// usize +// } #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u64x2, i64x2), @@ -860,7 +1237,6 @@ uniform_simd_int_impl! { (u64x8, i64x8), u64 } - #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u32x2, i32x2), @@ -869,7 +1245,6 @@ uniform_simd_int_impl! { (u32x16, i32x16), u32 } - #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u16x2, i16x2), @@ -879,7 +1254,6 @@ uniform_simd_int_impl! { (u16x32, i16x32), u16 } - #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u8x2, i8x2), @@ -891,6 +1265,43 @@ uniform_simd_int_impl! { u8 } +#[cfg(feature = "simd_support")] +uniform_simd_int_gt8_impl! { + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + + (u16x16, i16x16), + (u16x32, i16x32), + + (u32x16, i32x16) +} +#[cfg(feature = "simd_support")] +uniform_simd_int_le8_impl! { + (u8x2, i8x2, i64x2, u64x2), + (u8x4, i8x4, i64x4, u64x4), + (u8x8, i8x8, i64x8, u64x8), + + (u16x2, i16x2, i64x2, u64x2), + (u16x4, i16x4, i64x4, u64x4), + (u16x8, i16x8, i64x8, u64x8), + + (u32x2, i32x2, i64x2, u64x2), + (u32x4, i32x4, i64x4, u64x4), + (u32x8, i32x8, i64x8, u64x8), + + (u64x2, i64x2, i64x2, u64x2), + (u64x4, i64x4, i64x4, u64x4), + (u64x8, i64x8, i64x8, u64x8), + + (u128x2, i128x2, i64x2, u128x2), + (u128x4, i128x4, i64x4, u128x4) + + // (usizex2, isizex2, i64x2, u64x2), + // (usizex4, isizex4, i64x4, u64x4), + // (usizex8, isizex8, i64x8, u64x8) +} + #[cfg(test)] mod tests { diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 89da5fd7aa..f614524364 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -202,6 +202,197 @@ mod simd_wmul { wmul_impl_large! { (u16x32,) u16, 8 } wmul_impl_large! { (u32x16,) u32, 16 } wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 } + wmul_impl_large! { (u128x2, u128x4,) u128, 64 } + + #[cfg(target_pointer_width = "64")] + wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 32 } + #[cfg(target_pointer_width = "32")] + wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 16 } + #[cfg(target_pointer_width = "16")] + wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 8 } +} + +pub(crate) trait OverflowingAdd { + fn overflowing_add(&self, y: Self) -> (Self, T) where Self: Sized; +} + +#[cfg(feature = "simd_support")] +macro_rules! 
impl_overflowing_add { + ($(($ty:ty, $signed_ty:ty, $mask:ty)),+) => {$( + #[cfg(feature = "simd_support")] + impl OverflowingAdd<$mask> for $ty { + #[inline] + fn overflowing_add(&self, y: Self) -> (Self, $mask) { + let sum = *self + y; + + let lane_bitwidth = std::mem::size_of::<$ty>() / <$ty>::lanes(); + let mask = <$ty>::splat(1 << (lane_bitwidth - 1)); + + let neg_self: $signed_ty = (mask ^ *self).cast(); + let neg_sum: $signed_ty = (mask ^ sum).cast(); + + let overflowed = neg_self.gt(neg_sum); + + (sum, overflowed) + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_overflowing_add!( + (u8x2, i8x2, m8x2), + (u8x4, i8x4, m8x4), + (u8x8, i8x8, m8x8), + (u8x16, i8x16, m8x16), + (u8x32, i8x32, m8x32), + (u8x64, i8x64, m8x64), + + (u16x2, i16x2, m16x2), + (u16x4, i16x4, m16x4), + (u16x8, i16x8, m16x8), + (u16x16, i16x16, m16x16), + (u16x32, i16x32, m16x32), + + (u32x2, i32x2, m32x2), + (u32x4, i32x4, m32x4), + (u32x8, i32x8, m32x8), + (u32x16, i32x16, m32x16), + + (u64x2, i64x2, m64x2), + (u64x4, i64x4, m64x4), + (u64x8, i64x8, m64x8), + + (u128x2, i128x2, m128x2), + (u128x4, i128x4, m128x4) +); + + +#[cfg(feature = "simd_support")] +pub(crate) trait SimdCombine { + fn simd_combine(&self) -> T; +} + +#[cfg(feature = "simd_support")] +macro_rules! impl_combine_2 { + ($(($wide:ident, $short:ident)),+) => {$( + impl SimdCombine<$wide> for [$short] { + #[inline] + fn simd_combine(&self) -> $wide { + shuffle!(self[0], self[1], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_combine_2!{ + (u32x16, u32x8), + (u16x16, u16x8), + (u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +macro_rules! impl_combine_4 { + ($(($wide:ident, $mid:ident, $short:ident)),+) => {$( + impl SimdCombine<$wide> for [$short] { + #[inline] + fn simd_combine(&self) -> $wide { + let a: $mid = self.chunks_exact(2).nth(0).unwrap().simd_combine(); + let b: $mid = self.chunks_exact(2).nth(1).unwrap().simd_combine(); + shuffle!(a, b, [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ]) + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_combine_4!{ + (u16x32, u16x16, u16x8), + (u8x32, u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +impl SimdCombine for [u8x8; 8] { + #[inline] + fn simd_combine(&self) -> u8x64 { + let a: u8x32 = self.chunks_exact(4).nth(0).unwrap().simd_combine(); + let b: u8x32 = self.chunks_exact(4).nth(1).unwrap().simd_combine(); + shuffle!(a, b, [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ]) + } +} + + +#[cfg(feature = "simd_support")] +pub(crate) trait SimdSplit { + fn simd_split(&self) -> T; +} + +#[cfg(feature = "simd_support")] +macro_rules! impl_split_2 { + ($(($wide:ident, $short:ident)),+) => {$( + impl SimdSplit<[$short; 2]> for $wide { + #[inline] + fn simd_split(&self) -> [$short; 2] { + let a = shuffle!(self, [0, 1, 2, 3, 4, 5, 6, 7]); + let b = shuffle!(self, [8, 9, 10, 11, 12, 13, 14, 15]); + [a, b] + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_split_2!{ + (u32x16, u32x8), + (u16x16, u16x8), + (u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +macro_rules! 
impl_split_4 { + ($(($wide:ident, $mid:ident, $short:ident)),+) => {$( + impl SimdSplit<[$short; 4]> for $wide { + #[inline] + fn simd_split(&self) -> [$short; 4] { + let a = shuffle!(self, [0, 1, 2, 3, 4, 5, 6, 7]); + let b = shuffle!(self, [8, 9, 10, 11, 12, 13, 14, 15]); + let c = shuffle!(self, [16, 17, 18, 19, 20, 21, 22, 23]); + let d = shuffle!(self, [24, 25, 26, 27, 28, 29, 30, 31]); + [a, b, c, d] + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_split_4!{ + (u16x32, u16x16, u16x8), + (u8x32, u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +impl SimdSplit<[u8x8; 8]> for u8x64 { + #[inline] + fn simd_split(&self) -> [u8x8; 8] { + let x0 = shuffle!(self, [0, 1, 2, 3, 4, 5, 6, 7]); + let x1 = shuffle!(self, [8, 9, 10, 11, 12, 13, 14, 15]); + let x2 = shuffle!(self, [16, 17, 18, 19, 20, 21, 22, 23]); + let x3 = shuffle!(self, [24, 25, 26, 27, 28, 29, 30, 31]); + let x4 = shuffle!(self, [32, 33, 34, 35, 36, 37, 38, 39]); + let x5 = shuffle!(self, [40, 41, 42, 43, 44, 45, 46, 47]); + let x6 = shuffle!(self, [48, 49, 50, 51, 52, 53, 54, 55]); + let x7 = shuffle!(self, [56, 57, 58, 59, 60, 61, 62, 63]); + [x0, x1, x2, x3, x4, x5, x6, x7] + } } /// Helper trait when dealing with scalar and SIMD floating point types. From 1cfba720bf73e79e5fb301b0a1b142adda4bb36e Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 16:19:01 +0100 Subject: [PATCH 07/17] Add uniform benches for SIMD types --- benches/uniform.rs | 68 +++++++++++++++++++++++++++++++++++- src/distributions/uniform.rs | 60 +++++-------------------------- 2 files changed, 76 insertions(+), 52 deletions(-) diff --git a/benches/uniform.rs b/benches/uniform.rs index 5e19c046e1..e5b02a2912 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -8,6 +8,7 @@ use criterion::{criterion_group, criterion_main}; use criterion::{BenchmarkId, Criterion}; +#[cfg(feature = "simd_support")] use packed_simd::*; use rand::distributions::uniform::{SampleUniform, UniformSampler}; use rand::prelude::*; @@ -59,9 +60,74 @@ fn uniform_int(c: &mut Criterion) { bench_int!(c, i128, (i128::MIN, 1)); } +#[cfg(feature = "simd_support")] +macro_rules! bench_simd_group { + ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { + for input in $inputs { + $g.bench_with_input( + BenchmarkId::new($name, input.0), + &input.1, + |b, (low, high)| { + let mut rng = BenchRng::from_entropy(); + let (low, high) = (<$T>::splat(*low), <$T>::splat(*high)); + b.iter(|| <$T as SampleUniform>::Sampler::$f(low, high, &mut rng)) + }, + ); + } + $g.bench_function(BenchmarkId::new($name, "varying"), |b| { + let (low, mut high) = (<$T>::splat($inputs[0].1 .0), <$T>::splat($inputs[1].1 .1)); + let mut rng = BenchRng::from_entropy(); + b.iter(|| { + high += 1; + <$T as SampleUniform>::Sampler::$f(low, high, &mut rng) + }) + }); + }; +} + +#[cfg(feature = "simd_support")] +macro_rules! 
bench_simd { + ($c:expr, $T:ty, $high:expr/*, $incr:expr*/) => {{ + let mut g = $c.benchmark_group(concat!("uniform_simd_", stringify!($T))); + let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))]; + + bench_simd_group!("Old", $T, sample_single_inclusive, g, inputs); + bench_simd_group!("Canon", $T, sample_single_inclusive_canon, g, inputs); + bench_simd_group!( + "Canon-branchless", + $T, + sample_single_inclusive_canon_branchless, + g, + inputs + ); + bench_simd_group!("Canon-scalar", $T, sample_inclusive_canon_scalar, g, inputs); + bench_simd_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs); + }}; +} + +#[cfg(feature = "simd_support")] +fn uniform_simd(c: &mut Criterion) { + // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size + // the exact values were determined with an exhaustive search + bench_simd!(c, i8x16, (i8::MIN, 116)); + bench_simd!(c, i8x32, (i8::MIN, 116)); + bench_simd!(c, i16x8, (i16::MIN, 32407)); + bench_simd!(c, i16x16, (i16::MIN, 32407)); + bench_simd!(c, i32x4, (i32::MIN, 1)); + bench_simd!(c, i32x8, (i32::MIN, 1)); + bench_simd!(c, i64x2, (i64::MIN, 1)); + bench_simd!(c, i64x4, (i64::MIN, 1)); + bench_simd!(c, i64x8, (i64::MIN, 1)); + bench_simd!(c, i128x2, (i128::MIN, 1)); + bench_simd!(c, i128x4, (i128::MIN, 1)); +} + +#[cfg(not(feature = "simd_support"))] +fn uniform_simd(_c: &mut Criterion) {} + criterion_group! { name = benches; config = Criterion::default(); - targets = uniform_int + targets = uniform_int, uniform_simd } criterion_main!(benches); diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 231237609f..33ee9d63d0 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -891,7 +891,7 @@ macro_rules! uniform_simd_int_gt8_impl { /// Canon's method #[inline(always)] - pub fn sample_inclusive_canon( + pub fn sample_single_inclusive_canon_branchless( low_b: B1, high_b: B2, rng: &mut R, ) -> $ty where @@ -911,34 +911,21 @@ macro_rules! uniform_simd_int_gt8_impl { is_full_range.select(cast_rand_bits, low + cast_result) } - /// Canon's method + /// #[inline(always)] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, + pub fn sample_inclusive_canon_scalar( + _low_b: B1, _high_b: B2, _rng: &mut R, ) -> $ty where B1: SampleBorrow<$ty> + Sized, B2: SampleBorrow<$ty> + Sized, { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - if lo_order.gt(0 - range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) + Default::default() // dummy impl } - /// Canon + Lemire's early-out + /// Canon's method #[inline(always)] - pub fn sample_inclusive_canon_lemire( + pub fn sample_single_inclusive_canon( low_b: B1, high_b: B2, rng: &mut R, ) -> $ty where @@ -952,9 +939,7 @@ macro_rules! uniform_simd_int_gt8_impl { let rand_bits = rng.gen::<$unsigned>(); let (mut result, lo_order) = rand_bits.wmul(range); - // lo_order < range.wrapping_neg() % range { - // may panic if range == 0 - if lo_order.lt((0 - range) % range).any() { + if lo_order.gt(0 - range).any() { Self::canon_successive(range, &mut result, lo_order, rng); } @@ -1053,7 +1038,7 @@ macro_rules! 
uniform_simd_int_le8_impl { /// #[inline(always)] - pub fn sample_inclusive_canon( + pub fn sample_single_inclusive_canon_branchless( low_b: B1, high_b: B2, rng: &mut R, ) -> $ty where @@ -1137,33 +1122,6 @@ macro_rules! uniform_simd_int_le8_impl { is_full_range.select(cast_rand_bits, low + cast_result) } - /// - #[inline(always)] - pub fn sample_inclusive_canon_lemire( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - // lo_order < range.wrapping_neg() % range { - // may panic if range == 0 - if lo_order.lt((0 - range) % range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - /// #[inline(always)] pub fn sample_single_inclusive_bitmask( From 7cfa6f90228daf8cc529e598338f186263355853 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 16:41:54 +0100 Subject: [PATCH 08/17] Uniform: dedup --- src/distributions/uniform.rs | 43 ++++++++++++++---------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 33ee9d63d0..4588856884 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -844,20 +844,7 @@ macro_rules! uniform_simd_int_impl { } } } - }; - // bulk implementation - ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { - $( - uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); - uniform_simd_int_impl!($signed, $unsigned, $u_scalar); - )+ - }; -} - -#[cfg(feature = "simd_support")] -macro_rules! uniform_simd_int_gt8_impl { - ($ty:ident, $unsigned:ident) => { impl UniformInt<$ty> { #[inline(always)] fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) @@ -868,10 +855,26 @@ macro_rules! uniform_simd_int_gt8_impl { let low = *low_b.borrow(); let high = *high_b.borrow(); assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + // wrapping sub and add let range: $unsigned = ((high - low) + 1).cast(); (range, low) } + } + }; + // bulk implementation + ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + $( + uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); + uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + )+ + }; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_gt8_impl { + ($ty:ident, $unsigned:ident) => { + impl UniformInt<$ty> { #[inline(always)] fn canon_successive( range: $unsigned, @@ -1005,20 +1008,6 @@ macro_rules! uniform_simd_int_gt8_impl { macro_rules! 
uniform_simd_int_le8_impl { ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { impl UniformInt<$ty> { - #[inline(always)] - fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); - // wrapping sub and add - let range: $unsigned = ((high - low) + 1).cast(); - (range, low) - } - #[inline(always)] fn canon_successive( range: $unsigned, From 4518f4dac6aa86483f3939e2323f963ea881290b Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 24 Feb 2022 15:27:39 +0000 Subject: [PATCH 09/17] Move UniformInt to a new submodule --- src/distributions/uniform.rs | 1038 +--------------------- src/distributions/uniform/uniform_int.rs | 1038 ++++++++++++++++++++++ 2 files changed, 1043 insertions(+), 1033 deletions(-) create mode 100644 src/distributions/uniform/uniform_int.rs diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 4588856884..4b81ab9ee9 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -106,26 +106,20 @@ //! [`UniformDuration`]: crate::distributions::uniform::UniformDuration //! [`SampleBorrow::borrow`]: crate::distributions::uniform::SampleBorrow::borrow -use core::ops::{Range, RangeInclusive}; - -use crate::distributions::utils::WideningMultiply; -#[cfg(feature = "simd_support")] -use crate::distributions::utils::{OverflowingAdd, SimdCombine, SimdSplit}; -use crate::distributions::Distribution; -use crate::{Rng, RngCore}; - #[cfg(not(feature = "std"))] #[allow(unused_imports)] // rustc doesn't detect that this is actually used use crate::distributions::utils::Float; - -#[cfg(feature = "simd_support")] use packed_simd::*; - +use crate::distributions::Distribution; +use crate::{Rng, RngCore}; +use core::ops::{Range, RangeInclusive}; #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; mod uniform_float; +mod uniform_int; mod uniform_other; pub use uniform_float::UniformFloat; +pub use uniform_int::UniformInt; pub use uniform_other::{UniformChar, UniformDuration}; /// Sample values uniformly between two bounds. @@ -387,992 +381,10 @@ impl SampleRange for RangeInclusive { } -//////////////////////////////////////////////////////////////////////////////// - -// What follows are all back-ends. - - -/// The back-end implementing [`UniformSampler`] for integer types. -/// -/// Unless you are implementing [`UniformSampler`] for your own type, this type -/// should not be used directly, use [`Uniform`] instead. -/// -/// # Implementation notes -/// -/// For simplicity, we use the same generic struct `UniformInt` for all -/// integer types `X`. This gives us only one field type, `X`; to store unsigned -/// values of this size, we take use the fact that these conversions are no-ops. -/// -/// For a closed range, the number of possible numbers we should generate is -/// `range = (high - low + 1)`. To avoid bias, we must ensure that the size of -/// our sample space, `zone`, is a multiple of `range`; other values must be -/// rejected (by replacing with a new random sample). -/// -/// As a special case, we use `range = 0` to represent the full range of the -/// result type (i.e. for `new_inclusive($ty::MIN, $ty::MAX)`). -/// -/// The optimum `zone` is the largest product of `range` which fits in our -/// (unsigned) target type. 
We calculate this by calculating how many numbers we -/// must reject: `reject = (MAX + 1) % range = (MAX - range + 1) % range`. Any (large) -/// product of `range` will suffice, thus in `sample_single` we multiply by a -/// power of 2 via bit-shifting (faster but may cause more rejections). -/// -/// The smallest integer PRNGs generate is `u32`. For 8- and 16-bit outputs we -/// use `u32` for our `zone` and samples (because it's not slower and because -/// it reduces the chance of having to reject a sample). In this case we cannot -/// store `zone` in the target type since it is too large, however we know -/// `ints_to_reject < range <= $unsigned::MAX`. -/// -/// An alternative to using a modulus is widening multiply: After a widening -/// multiply by `range`, the result is in the high word. Then comparing the low -/// word against `zone` makes sure our distribution is uniform. -#[derive(Clone, Copy, Debug, PartialEq)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformInt { - low: X, - range: X, - z: X, // either ints_to_reject or zone depending on implementation -} - -macro_rules! uniform_int_impl { - ($ty:ty, $unsigned:ident, $u_large:ident, $u_extra_large:ident) => { - impl SampleUniform for $ty { - type Sampler = UniformInt<$ty>; - } - - impl UniformSampler for UniformInt<$ty> { - // We play free and fast with unsigned vs signed here - // (when $ty is signed), but that's fine, since the - // contract of this macro is for $ty and $unsigned to be - // "bit-equal", so casting between them is a no-op. - - type X = $ty; - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low < high, "Uniform::new called with `low >= high`"); - UniformSampler::new_inclusive(low, high - 1) - } - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "Uniform::new_inclusive called with `low > high`" - ); - let unsigned_max = ::core::$u_large::MAX; - - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned; - let ints_to_reject = if range > 0 { - let range = $u_large::from(range); - (unsigned_max - range + 1) % range - } else { - 0 - }; - - UniformInt { - low, - // These are really $unsigned values, but store as $ty: - range: range as $ty, - z: ints_to_reject as $unsigned as $ty, - } - } - - #[inline] - fn sample(&self, rng: &mut R) -> Self::X { - let range = self.range as $unsigned as $u_large; - if range > 0 { - let unsigned_max = ::core::$u_large::MAX; - let zone = unsigned_max - (self.z as $unsigned as $u_large); - loop { - let v: $u_large = rng.gen(); - let (hi, lo) = v.wmul(range); - if lo <= zone { - return self.low.wrapping_add(hi as $ty); - } - } - } else { - // Sample from the entire integer range. 
- rng.gen() - } - } - - #[inline] - fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low < high, "UniformSampler::sample_single: low >= high"); - Self::sample_single_inclusive(low, high - 1, rng) - } - - #[inline] - fn sample_single_inclusive( - low_b: B1, high_b: B2, rng: &mut R, - ) -> Self::X - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - let zone = if ::core::$unsigned::MAX <= ::core::u16::MAX as $unsigned { - // Using a modulus is faster than the approximation for - // i8 and i16. I suppose we trade the cost of one - // modulus for near-perfect branch prediction. - let unsigned_max: $u_large = ::core::$u_large::MAX; - let ints_to_reject = (unsigned_max - range + 1) % range; - unsigned_max - ints_to_reject - } else { - // conservative but fast approximation. `- 1` is necessary to allow the - // same comparison without bias. - (range << range.leading_zeros()).wrapping_sub(1) - }; - - loop { - let v: $u_large = rng.gen(); - let (hi, lo) = v.wmul(range); - if lo <= zone { - return low.wrapping_add(hi as $ty); - } - } - } - } - - impl UniformInt<$ty> { - /// Sample single inclusive, using ONeill's method - #[inline] - pub fn sample_single_inclusive_oneill( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // we use the "Debiased Int Mult (t-opt, m-opt)" rejection sampling method - // described here https://www.pcg-random.org/posts/bounded-rands.html - // and here https://github.com/imneme/bounded-rands - - let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); - if lo < range { - let mut threshold = range.wrapping_neg(); - // this shortcut works best with large ranges - if threshold >= range { - threshold -= range; - if threshold >= range { - threshold %= range; - } - } - while lo < threshold { - let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); - hi = new_hi; - lo = new_lo; - } - } - low.wrapping_add(hi as $ty) - } - - /// Sample single inclusive, using Canon's method - #[inline] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // generate a sample using a sensible integer type - let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); - - // if the sample is biased... 
- if lo_order > range.wrapping_neg() { - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let (new_hi_order, _) = - (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); - // and adjust if needed - result += lo_order - .checked_add(new_hi_order as $u_extra_large) - .is_none() as $u_extra_large; - } - - low.wrapping_add(result as $ty) - } - - /// Sample single inclusive, using Canon's method with Lemire's early-out - #[inline] - pub fn sample_inclusive_canon_lemire( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // generate a sample using a sensible integer type - let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); - - // if the sample is biased... (since range won't be changing we can further - // improve this check with a modulo) - if lo_order < range.wrapping_neg() % range { - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let (new_hi_order, _) = - (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); - // and adjust if needed - result += lo_order - .checked_add(new_hi_order as $u_extra_large) - .is_none() as $u_extra_large; - } - - low.wrapping_add(result as $ty) - } - - /// Sample single inclusive, using the Bitmask method - #[inline] - pub fn sample_single_inclusive_bitmask( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let mut range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // the old impl use a mix of methods for different integer sizes, we only use - // the lz method here for a better comparison. - - let mut mask = $u_large::max_value(); - range -= 1; - mask >>= (range | 1).leading_zeros(); - loop { - let x = rng.gen::<$u_large>() & mask; - if x <= range { - return low.wrapping_add(x as $ty); - } - } - } - } - }; -} -uniform_int_impl! { i8, u8, u32, u64 } -uniform_int_impl! { i16, u16, u32, u64 } -uniform_int_impl! { i32, u32, u32, u64 } -uniform_int_impl! { i64, u64, u64, u64 } -uniform_int_impl! { i128, u128, u128, u128 } -uniform_int_impl! { u8, u8, u32, u64 } -uniform_int_impl! { u16, u16, u32, u64 } -uniform_int_impl! { u32, u32, u32, u64 } -uniform_int_impl! { u64, u64, u64, u64 } -uniform_int_impl! { u128, u128, u128, u128 } -#[cfg(any(target_pointer_width = "16", target_pointer_width = "32",))] -mod isize_int_impls { - use super::*; - uniform_int_impl! { isize, usize, usize, u64 } - uniform_int_impl! { usize, usize, usize, u64 } -} -#[cfg(not(any(target_pointer_width = "16", target_pointer_width = "32",)))] -mod isize_int_impls { - use super::*; - uniform_int_impl! { isize, usize, usize, usize } - uniform_int_impl! { usize, usize, usize, usize } -} - -#[cfg(feature = "simd_support")] -macro_rules! 
uniform_simd_int_impl { - ($ty:ident, $unsigned:ident, $u_scalar:ident) => { - // The "pick the largest zone that can fit in an `u32`" optimization - // is less useful here. Multiple lanes complicate things, we don't - // know the PRNG's minimal output size, and casting to a larger vector - // is generally a bad idea for SIMD performance. The user can still - // implement it manually. - - // TODO: look into `Uniform::::new(0u32, 100)` functionality - // perhaps `impl SampleUniform for $u_scalar`? - impl SampleUniform for $ty { - type Sampler = UniformInt<$ty>; - } - - impl UniformSampler for UniformInt<$ty> { - type X = $ty; - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new(low_b: B1, high_b: B2) -> Self - where B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); - UniformSampler::new_inclusive(low, high - 1) - } - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.le(high).all(), - "Uniform::new_inclusive called with `low > high`"); - let unsigned_max = ::core::$u_scalar::MAX; - - // NOTE: these may need to be replaced with explicitly - // wrapping operations if `packed_simd` changes - let range: $unsigned = ((high - low) + 1).cast(); - // `% 0` will panic at runtime. - let not_full_range = range.gt($unsigned::splat(0)); - // replacing 0 with `unsigned_max` allows a faster `select` - // with bitwise OR - let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); - // wrapping addition - let ints_to_reject = (unsigned_max - range + 1) % modulo; - // When `range` is 0, `lo` of `v.wmul(range)` will always be - // zero which means only one sample is needed. - let zone = unsigned_max - ints_to_reject; - - UniformInt { - low, - // These are really $unsigned values, but store as $ty: - range: range.cast(), - z: zone.cast(), - } - } - - fn sample(&self, rng: &mut R) -> Self::X { - let range: $unsigned = self.range.cast(); - let zone: $unsigned = self.z.cast(); - - // This might seem very slow, generating a whole new - // SIMD vector for every sample rejection. For most uses - // though, the chance of rejection is small and provides good - // general performance. With multiple lanes, that chance is - // multiplied. To mitigate this, we replace only the lanes of - // the vector which fail, iteratively reducing the chance of - // rejection. The replacement method does however add a little - // overhead. Benchmarking or calculating probabilities might - // reveal contexts where this replacement method is slower. - let mut v: $unsigned = rng.gen(); - loop { - let (hi, lo) = v.wmul(range); - let mask = lo.le(zone); - if mask.all() { - let hi: $ty = hi.cast(); - // wrapping addition - let result = self.low + hi; - // `select` here compiles to a blend operation - // When `range.eq(0).none()` the compare and blend - // operations are avoided. 
- let v: $ty = v.cast(); - return range.gt($unsigned::splat(0)).select(result, v); - } - // Replace only the failing lanes - v = mask.select(v, rng.gen()); - } - } - } - - impl UniformInt<$ty> { - #[inline(always)] - fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); - // wrapping sub and add - let range: $unsigned = ((high - low) + 1).cast(); - (range, low) - } - } - }; - - // bulk implementation - ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { - $( - uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); - uniform_simd_int_impl!($signed, $unsigned, $u_scalar); - )+ - }; -} - -#[cfg(feature = "simd_support")] -macro_rules! uniform_simd_int_gt8_impl { - ($ty:ident, $unsigned:ident) => { - impl UniformInt<$ty> { - #[inline(always)] - fn canon_successive( - range: $unsigned, - result: &mut $unsigned, - lo_order: $unsigned, - rng: &mut R - ) { - let mut vecs_64 = range.simd_split(); - for x in &mut vecs_64 { - *x = rng.gen::().wmul((*x).cast()).0.cast(); - } - let cast_new_hi: $unsigned = vecs_64.simd_combine(); - - let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); - *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); - } - - /// Canon's method - #[inline(always)] - pub fn sample_single_inclusive_canon_branchless( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - Self::canon_successive(range, &mut result, lo_order, rng); - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_inclusive_canon_scalar( - _low_b: B1, _high_b: B2, _rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - Default::default() // dummy impl - } - - /// Canon's method - #[inline(always)] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - if lo_order.gt(0 - range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// Bitmask - #[inline(always)] - pub fn sample_single_inclusive_bitmask( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (mut range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate bitmask - range -= 1; - let mut mask = range | 1; - - mask |= mask >> 1; - mask |= mask >> 2; - mask |= mask >> 4; - - const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / 
<$ty>::lanes(); - if LANE_WIDTH >= 16 { mask |= mask >> 8; } - if LANE_WIDTH >= 32 { mask |= mask >> 16; } - if LANE_WIDTH >= 64 { mask |= mask >> 32; } - if LANE_WIDTH >= 128 { mask |= mask >> 64; } - - let mut v: $unsigned = rng.gen(); - loop { - let masked = v & mask; - let accept = masked.le(range); - if accept.all() { - let masked: $ty = masked.cast(); - // wrapping addition - let result = low + masked; - // `select` here compiles to a blend operation - // When `range.eq(0).none()` the compare and blend - // operations are avoided. - let v: $ty = v.cast(); - return is_full_range.select(v, result); - } - // Replace only the failing lanes - v = accept.select(v, rng.gen()); - } - } - } - }; - - ($(($unsigned:ident, $signed:ident)),+) => {$( - uniform_simd_int_gt8_impl!{ $unsigned, $unsigned } - uniform_simd_int_gt8_impl!{ $signed, $unsigned } - )+}; -} - -#[cfg(feature = "simd_support")] -macro_rules! uniform_simd_int_le8_impl { - ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { - impl UniformInt<$ty> { - #[inline(always)] - fn canon_successive( - range: $unsigned, - result: &mut $unsigned, - lo_order: $unsigned, - rng: &mut R - ) { - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); - let large_range: $u_extra_large = range.cast(); - let (new_hi_order, _) = new_bits.wmul(large_range); - // and adjust if needed - let cast_new_hi: $unsigned = new_hi_order.cast(); - let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); - *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); - } - - /// - #[inline(always)] - pub fn sample_single_inclusive_canon_branchless( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - Self::canon_successive(range, &mut result, lo_order, rng); - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_inclusive_canon_scalar( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); - let large_range: $u_extra_large = range.cast(); - - // let (new_hi_order, _) = new_bits.wmul(large_range); - let mut new_hi_order = <$u_extra_large>::default(); - - for i in 0..<$ty>::lanes() { - let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i)); - new_hi_order = new_hi_order.replace(i, shi); - } - - // and adjust if needed - let cast_new_hi: $unsigned = new_hi_order.cast(); - let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); - result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = 
rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - if lo_order.gt(0 - range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_single_inclusive_bitmask( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (mut range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate bitmask - range -= 1; - let mut mask = range | 1; - - mask |= mask >> 1; - mask |= mask >> 2; - mask |= mask >> 4; - - const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); - if LANE_WIDTH >= 16 { mask |= mask >> 8; } - if LANE_WIDTH >= 32 { mask |= mask >> 16; } - if LANE_WIDTH >= 64 { mask |= mask >> 32; } - if LANE_WIDTH >= 128 { mask |= mask >> 64; } - - let mut v: $unsigned = rng.gen(); - loop { - let masked = v & mask; - let accept = masked.le(range); - if accept.all() { - let masked: $ty = masked.cast(); - // wrapping addition - let result = low + masked; - // `select` here compiles to a blend operation - // When `range.eq(0).none()` the compare and blend - // operations are avoided. - let v: $ty = v.cast(); - return is_full_range.select(v, result); - } - // Replace only the failing lanes - v = accept.select(v, rng.gen()); - } - } - } - }; - - ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$( - uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large } - uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large } - )+}; -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u128x2, i128x2), - (u128x4, i128x4), - u128 -} -// #[cfg(feature = "simd_support")] -// uniform_simd_int_impl! { -// (usizex2, isizex2), -// (usizex4, isizex4), -// (usizex8, isizex8), -// usize -// } -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u64x2, i64x2), - (u64x4, i64x4), - (u64x8, i64x8), - u64 -} -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u32x2, i32x2), - (u32x4, i32x4), - (u32x8, i32x8), - (u32x16, i32x16), - u32 -} -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u16x2, i16x2), - (u16x4, i16x4), - (u16x8, i16x8), - (u16x16, i16x16), - (u16x32, i16x32), - u16 -} -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u8x2, i8x2), - (u8x4, i8x4), - (u8x8, i8x8), - (u8x16, i8x16), - (u8x32, i8x32), - (u8x64, i8x64), - u8 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_gt8_impl! { - (u8x16, i8x16), - (u8x32, i8x32), - (u8x64, i8x64), - - (u16x16, i16x16), - (u16x32, i16x32), - - (u32x16, i32x16) -} -#[cfg(feature = "simd_support")] -uniform_simd_int_le8_impl! 
{ - (u8x2, i8x2, i64x2, u64x2), - (u8x4, i8x4, i64x4, u64x4), - (u8x8, i8x8, i64x8, u64x8), - - (u16x2, i16x2, i64x2, u64x2), - (u16x4, i16x4, i64x4, u64x4), - (u16x8, i16x8, i64x8, u64x8), - - (u32x2, i32x2, i64x2, u64x2), - (u32x4, i32x4, i64x4, u64x4), - (u32x8, i32x8, i64x8, u64x8), - - (u64x2, i64x2, i64x2, u64x2), - (u64x4, i64x4, i64x4, u64x4), - (u64x8, i64x8, i64x8, u64x8), - - (u128x2, i128x2, i64x2, u128x2), - (u128x4, i128x4, i64x4, u128x4) - - // (usizex2, isizex2, i64x2, u64x2), - // (usizex4, isizex4, i64x4, u64x4), - // (usizex8, isizex8, i64x8, u64x8) -} - - #[cfg(test)] mod tests { use super::*; - #[test] - #[cfg(feature = "serde1")] - fn test_uniform_serialization() { - let unit_box: Uniform = Uniform::new(-1, 1); - let de_unit_box: Uniform = - bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); - - assert_eq!(unit_box.0.low, de_unit_box.0.low); - assert_eq!(unit_box.0.range, de_unit_box.0.range); - assert_eq!(unit_box.0.z, de_unit_box.0.z); - } - - #[should_panic] - #[test] - fn test_uniform_bad_limits_equal_int() { - Uniform::new(10, 10); - } - - #[test] - fn test_uniform_good_limits_equal_int() { - let mut rng = crate::test::rng(804); - let dist = Uniform::new_inclusive(10, 10); - for _ in 0..20 { - assert_eq!(rng.sample(dist), 10); - } - } - - #[should_panic] - #[test] - fn test_uniform_bad_limits_flipped_int() { - Uniform::new(10, 5); - } - - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_integers() { - use core::{i128, u128}; - use core::{i16, i32, i64, i8, isize}; - use core::{u16, u32, u64, u8, usize}; - - let mut rng = crate::test::rng(251); - macro_rules! t { - ($ty:ident, $v:expr, $le:expr, $lt:expr) => {{ - for &(low, high) in $v.iter() { - let my_uniform = Uniform::new(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $lt(v, high)); - } - - let my_uniform = Uniform::new_inclusive(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $le(v, high)); - } - - let my_uniform = Uniform::new(&low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $lt(v, high)); - } - - let my_uniform = Uniform::new_inclusive(&low, &high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $le(v, high)); - } - - for _ in 0..1000 { - let v = <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng); - assert!($le(low, v) && $lt(v, high)); - } - - for _ in 0..1000 { - let v = <$ty as SampleUniform>::Sampler::sample_single_inclusive(low, high, &mut rng); - assert!($le(low, v) && $le(v, high)); - } - } - }}; - - // scalar bulk - ($($ty:ident),*) => {{ - $(t!( - $ty, - [(0, 10), (10, 127), ($ty::MIN, $ty::MAX)], - |x, y| x <= y, - |x, y| x < y - );)* - }}; - - // simd bulk - ($($ty:ident),* => $scalar:ident) => {{ - $(t!( - $ty, - [ - ($ty::splat(0), $ty::splat(10)), - ($ty::splat(10), $ty::splat(127)), - ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), - ], - |x: $ty, y| x.le(y).all(), - |x: $ty, y| x.lt(y).all() - );)* - }}; - } - t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, i128, u128); - - #[cfg(feature = "simd_support")] - { - t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); - t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); - t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); - t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); - t!(u32x2, u32x4, u32x8, u32x16 => u32); - t!(i32x2, i32x4, i32x8, i32x16 => i32); - t!(u64x2, u64x4, u64x8 => u64); - t!(i64x2, i64x4, 
i64x8 => i64); - } - } - #[test] fn test_custom_uniform() { use crate::distributions::uniform::{ @@ -1422,20 +434,6 @@ mod tests { } } - #[test] - fn test_uniform_from_std_range() { - let r = Uniform::from(2u32..7); - assert_eq!(r.0.low, 2); - assert_eq!(r.0.range, 5); - } - - #[test] - fn test_uniform_from_std_range_inclusive() { - let r = Uniform::from(2u32..=6); - assert_eq!(r.0.low, 2); - assert_eq!(r.0.range, 5); - } - pub(crate) fn test_samples( lb: T, ub: T, expected_single: &[T], expected_multiple: &[T], ) where Uniform: Distribution { @@ -1453,30 +451,4 @@ mod tests { } assert_eq!(&buf, expected_multiple); } - - #[test] - fn value_stability() { - // We test on a sub-set of types; possibly we should do more. - // TODO: SIMD types - - test_samples(11u8, 219, &[17, 66, 214], &[181, 93, 165]); - test_samples(11u32, 219, &[17, 66, 214], &[181, 93, 165]); - - test_samples(0f32, 1e-2f32, &[0.0003070104, 0.0026630748, 0.00979833], &[ - 0.008194133, - 0.00398172, - 0.007428536, - ]); - test_samples( - -1e10f64, - 1e10f64, - &[-4673848682.871551, 6388267422.932352, 4857075081.198343], - &[1173375212.1808167, 1917642852.109581, 2365076174.3153973], - ); - } - - #[test] - fn uniform_distributions_can_be_compared() { - assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); - } } diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs new file mode 100644 index 0000000000..8a7dac5b71 --- /dev/null +++ b/src/distributions/uniform/uniform_int.rs @@ -0,0 +1,1038 @@ +// Copyright 2018-2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{SampleBorrow, SampleUniform, UniformSampler}; +use crate::distributions::utils::WideningMultiply; +#[cfg(feature = "simd_support")] +use crate::distributions::utils::{OverflowingAdd, SimdCombine, SimdSplit}; +use crate::Rng; +#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; + +/// The back-end implementing [`UniformSampler`] for integer types. +/// +/// Unless you are implementing [`UniformSampler`] for your own type, this type +/// should not be used directly, use [`Uniform`] instead. +/// +/// # Implementation notes +/// +/// For simplicity, we use the same generic struct `UniformInt` for all +/// integer types `X`. This gives us only one field type, `X`; to store unsigned +/// values of this size, we take use the fact that these conversions are no-ops. +/// +/// For a closed range, the number of possible numbers we should generate is +/// `range = (high - low + 1)`. To avoid bias, we must ensure that the size of +/// our sample space, `zone`, is a multiple of `range`; other values must be +/// rejected (by replacing with a new random sample). +/// +/// As a special case, we use `range = 0` to represent the full range of the +/// result type (i.e. for `new_inclusive($ty::MIN, $ty::MAX)`). +/// +/// The optimum `zone` is the largest product of `range` which fits in our +/// (unsigned) target type. We calculate this by calculating how many numbers we +/// must reject: `reject = (MAX + 1) % range = (MAX - range + 1) % range`. Any (large) +/// product of `range` will suffice, thus in `sample_single` we multiply by a +/// power of 2 via bit-shifting (faster but may cause more rejections). +/// +/// The smallest integer PRNGs generate is `u32`. 
For 8- and 16-bit outputs we +/// use `u32` for our `zone` and samples (because it's not slower and because +/// it reduces the chance of having to reject a sample). In this case we cannot +/// store `zone` in the target type since it is too large, however we know +/// `ints_to_reject < range <= $unsigned::MAX`. +/// +/// An alternative to using a modulus is widening multiply: After a widening +/// multiply by `range`, the result is in the high word. Then comparing the low +/// word against `zone` makes sure our distribution is uniform. +#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformInt { + low: X, + range: X, + z: X, // either ints_to_reject or zone depending on implementation +} + +macro_rules! uniform_int_impl { + ($ty:ty, $unsigned:ident, $u_large:ident, $u_extra_large:ident) => { + impl SampleUniform for $ty { + type Sampler = UniformInt<$ty>; + } + + impl UniformSampler for UniformInt<$ty> { + // We play free and fast with unsigned vs signed here + // (when $ty is signed), but that's fine, since the + // contract of this macro is for $ty and $unsigned to be + // "bit-equal", so casting between them is a no-op. + + type X = $ty; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low < high, "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - 1) + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "Uniform::new_inclusive called with `low > high`" + ); + let unsigned_max = ::core::$u_large::MAX; + + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned; + let ints_to_reject = if range > 0 { + let range = $u_large::from(range); + (unsigned_max - range + 1) % range + } else { + 0 + }; + + UniformInt { + low, + // These are really $unsigned values, but store as $ty: + range: range as $ty, + z: ints_to_reject as $unsigned as $ty, + } + } + + #[inline] + fn sample(&self, rng: &mut R) -> Self::X { + let range = self.range as $unsigned as $u_large; + if range > 0 { + let unsigned_max = ::core::$u_large::MAX; + let zone = unsigned_max - (self.z as $unsigned as $u_large); + loop { + let v: $u_large = rng.gen(); + let (hi, lo) = v.wmul(range); + if lo <= zone { + return self.low.wrapping_add(hi as $ty); + } + } + } else { + // Sample from the entire integer range. 
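// A worked sketch of the zone arithmetic from the notes above, written out for
// a concrete `u32` range as a hypothetical free function (the
// `uniform_int_impl!` macro generates the real code). For example, with
// `range = 100`: `ints_to_reject = 2^32 mod 100 = 96`, so
// `zone = u32::MAX - 96` and exactly 96 of the 2^32 possible inputs are
// rejected per attempt.
fn sample_u32(low: u32, high_inclusive: u32, mut next_u32: impl FnMut() -> u32) -> u32 {
    let range = high_inclusive.wrapping_sub(low).wrapping_add(1);
    if range == 0 {
        return next_u32(); // full `u32` range: every sample is acceptable
    }
    let ints_to_reject = (u32::MAX - range + 1) % range; // == 2^32 mod range
    let zone = u32::MAX - ints_to_reject;
    loop {
        let wide = (next_u32() as u64) * (range as u64); // widening multiply
        let (hi, lo) = ((wide >> 32) as u32, wide as u32);
        if lo <= zone {
            return low.wrapping_add(hi); // `hi` is uniform in `0..range`
        }
    }
}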
+ rng.gen() + } + } + + #[inline] + fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low < high, "UniformSampler::sample_single: low >= high"); + Self::sample_single_inclusive(low, high - 1, rng) + } + + #[inline] + fn sample_single_inclusive( + low_b: B1, high_b: B2, rng: &mut R, + ) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + let zone = if ::core::$unsigned::MAX <= ::core::u16::MAX as $unsigned { + // Using a modulus is faster than the approximation for + // i8 and i16. I suppose we trade the cost of one + // modulus for near-perfect branch prediction. + let unsigned_max: $u_large = ::core::$u_large::MAX; + let ints_to_reject = (unsigned_max - range + 1) % range; + unsigned_max - ints_to_reject + } else { + // conservative but fast approximation. `- 1` is necessary to allow the + // same comparison without bias. + (range << range.leading_zeros()).wrapping_sub(1) + }; + + loop { + let v: $u_large = rng.gen(); + let (hi, lo) = v.wmul(range); + if lo <= zone { + return low.wrapping_add(hi as $ty); + } + } + } + } + + impl UniformInt<$ty> { + /// Sample single inclusive, using ONeill's method + #[inline] + pub fn sample_single_inclusive_oneill( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // we use the "Debiased Int Mult (t-opt, m-opt)" rejection sampling method + // described here https://www.pcg-random.org/posts/bounded-rands.html + // and here https://github.com/imneme/bounded-rands + + let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); + if lo < range { + let mut threshold = range.wrapping_neg(); + // this shortcut works best with large ranges + if threshold >= range { + threshold -= range; + if threshold >= range { + threshold %= range; + } + } + while lo < threshold { + let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); + hi = new_hi; + lo = new_lo; + } + } + low.wrapping_add(hi as $ty) + } + + /// Sample single inclusive, using Canon's method + #[inline] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... 
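// The bias check used here, sketched as a standalone `u32` function with
// 64-bit arithmetic (hypothetical names; the macro generates the real method).
// One 64-bit sample yields a candidate `result` plus a fractional part
// `lo_order`; a second sample can only change `result` when `lo_order` is
// within `range` of wrapping, so the extra sample is rarely drawn.
fn canon_u32(low: u32, high_inclusive: u32, mut next_u64: impl FnMut() -> u64) -> u32 {
    let range = high_inclusive.wrapping_sub(low).wrapping_add(1) as u64;
    if range == 0 {
        return next_u64() as u32; // full `u32` range
    }
    // 64x64 -> 128-bit widening multiply
    let wmul = |a: u64, b: u64| {
        let wide = (a as u128) * (b as u128);
        ((wide >> 64) as u64, wide as u64)
    };
    let (mut result, lo_order) = wmul(next_u64(), range);
    if lo_order > range.wrapping_neg() {
        // 64 more fractional bits; carry into `result` if the low parts overflow
        let (new_hi_order, _) = wmul(next_u64(), range);
        result += lo_order.checked_add(new_hi_order).is_none() as u64;
    }
    low.wrapping_add(result as u32)
}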
+ if lo_order > range.wrapping_neg() { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using Canon's method with Lemire's early-out + #[inline] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... (since range won't be changing we can further + // improve this check with a modulo) + if lo_order < range.wrapping_neg() % range { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using the Bitmask method + #[inline] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let mut range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // the old impl use a mix of methods for different integer sizes, we only use + // the lz method here for a better comparison. + + let mut mask = $u_large::max_value(); + range -= 1; + mask >>= (range | 1).leading_zeros(); + loop { + let x = rng.gen::<$u_large>() & mask; + if x <= range { + return low.wrapping_add(x as $ty); + } + } + } + } + }; +} +uniform_int_impl! { i8, u8, u32, u64 } +uniform_int_impl! { i16, u16, u32, u64 } +uniform_int_impl! { i32, u32, u32, u64 } +uniform_int_impl! { i64, u64, u64, u64 } +uniform_int_impl! { i128, u128, u128, u128 } +uniform_int_impl! { u8, u8, u32, u64 } +uniform_int_impl! { u16, u16, u32, u64 } +uniform_int_impl! { u32, u32, u32, u64 } +uniform_int_impl! { u64, u64, u64, u64 } +uniform_int_impl! { u128, u128, u128, u128 } +#[cfg(any(target_pointer_width = "16", target_pointer_width = "32",))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, u64 } + uniform_int_impl! { usize, usize, usize, u64 } +} +#[cfg(not(any(target_pointer_width = "16", target_pointer_width = "32",)))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, usize } + uniform_int_impl! { usize, usize, usize, usize } +} + +#[cfg(feature = "simd_support")] +macro_rules! 
uniform_simd_int_impl { + ($ty:ident, $unsigned:ident, $u_scalar:ident) => { + // The "pick the largest zone that can fit in an `u32`" optimization + // is less useful here. Multiple lanes complicate things, we don't + // know the PRNG's minimal output size, and casting to a larger vector + // is generally a bad idea for SIMD performance. The user can still + // implement it manually. + + // TODO: look into `Uniform::::new(0u32, 100)` functionality + // perhaps `impl SampleUniform for $u_scalar`? + impl SampleUniform for $ty { + type Sampler = UniformInt<$ty>; + } + + impl UniformSampler for UniformInt<$ty> { + type X = $ty; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - 1) + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), + "Uniform::new_inclusive called with `low > high`"); + let unsigned_max = ::core::$u_scalar::MAX; + + // NOTE: these may need to be replaced with explicitly + // wrapping operations if `packed_simd` changes + let range: $unsigned = ((high - low) + 1).cast(); + // `% 0` will panic at runtime. + let not_full_range = range.gt($unsigned::splat(0)); + // replacing 0 with `unsigned_max` allows a faster `select` + // with bitwise OR + let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); + // wrapping addition + let ints_to_reject = (unsigned_max - range + 1) % modulo; + // When `range` is 0, `lo` of `v.wmul(range)` will always be + // zero which means only one sample is needed. + let zone = unsigned_max - ints_to_reject; + + UniformInt { + low, + // These are really $unsigned values, but store as $ty: + range: range.cast(), + z: zone.cast(), + } + } + + fn sample(&self, rng: &mut R) -> Self::X { + let range: $unsigned = self.range.cast(); + let zone: $unsigned = self.z.cast(); + + // This might seem very slow, generating a whole new + // SIMD vector for every sample rejection. For most uses + // though, the chance of rejection is small and provides good + // general performance. With multiple lanes, that chance is + // multiplied. To mitigate this, we replace only the lanes of + // the vector which fail, iteratively reducing the chance of + // rejection. The replacement method does however add a little + // overhead. Benchmarking or calculating probabilities might + // reveal contexts where this replacement method is slower. + let mut v: $unsigned = rng.gen(); + loop { + let (hi, lo) = v.wmul(range); + let mask = lo.le(zone); + if mask.all() { + let hi: $ty = hi.cast(); + // wrapping addition + let result = self.low + hi; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. 
+ let v: $ty = v.cast(); + return range.gt($unsigned::splat(0)).select(result, v); + } + // Replace only the failing lanes + v = mask.select(v, rng.gen()); + } + } + } + + impl UniformInt<$ty> { + #[inline(always)] + fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + // wrapping sub and add + let range: $unsigned = ((high - low) + 1).cast(); + (range, low) + } + } + }; + + // bulk implementation + ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + $( + uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); + uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + )+ + }; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_gt8_impl { + ($ty:ident, $unsigned:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + let mut vecs_64 = range.simd_split(); + for x in &mut vecs_64 { + *x = rng.gen::().wmul((*x).cast()).0.cast(); + } + let cast_new_hi: $unsigned = vecs_64.simd_combine(); + + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// Canon's method + #[inline(always)] + pub fn sample_single_inclusive_canon_branchless( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_scalar( + _low_b: B1, _high_b: B2, _rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + Default::default() // dummy impl + } + + /// Canon's method + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Bitmask + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / 
<$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident)),+) => {$( + uniform_simd_int_gt8_impl!{ $unsigned, $unsigned } + uniform_simd_int_gt8_impl!{ $signed, $unsigned } + )+}; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_le8_impl { + ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + let (new_hi_order, _) = new_bits.wmul(large_range); + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_canon_branchless( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_scalar( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + + // let (new_hi_order, _) = new_bits.wmul(large_range); + let mut new_hi_order = <$u_extra_large>::default(); + + for i in 0..<$ty>::lanes() { + let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i)); + new_hi_order = new_hi_order.replace(i, shi); + } + + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = 
rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$( + uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large } + uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large } + )+}; +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u128x2, i128x2), + (u128x4, i128x4), + u128 +} +// #[cfg(feature = "simd_support")] +// uniform_simd_int_impl! { +// (usizex2, isizex2), +// (usizex4, isizex4), +// (usizex8, isizex8), +// usize +// } +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u64x2, i64x2), + (u64x4, i64x4), + (u64x8, i64x8), + u64 +} +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u32x2, i32x2), + (u32x4, i32x4), + (u32x8, i32x8), + (u32x16, i32x16), + u32 +} +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u16x2, i16x2), + (u16x4, i16x4), + (u16x8, i16x8), + (u16x16, i16x16), + (u16x32, i16x32), + u16 +} +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u8x2, i8x2), + (u8x4, i8x4), + (u8x8, i8x8), + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + u8 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_gt8_impl! { + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + + (u16x16, i16x16), + (u16x32, i16x32), + + (u32x16, i32x16) +} +#[cfg(feature = "simd_support")] +uniform_simd_int_le8_impl! 
{ + (u8x2, i8x2, i64x2, u64x2), + (u8x4, i8x4, i64x4, u64x4), + (u8x8, i8x8, i64x8, u64x8), + + (u16x2, i16x2, i64x2, u64x2), + (u16x4, i16x4, i64x4, u64x4), + (u16x8, i16x8, i64x8, u64x8), + + (u32x2, i32x2, i64x2, u64x2), + (u32x4, i32x4, i64x4, u64x4), + (u32x8, i32x8, i64x8, u64x8), + + (u64x2, i64x2, i64x2, u64x2), + (u64x4, i64x4, i64x4, u64x4), + (u64x8, i64x8, i64x8, u64x8), + + (u128x2, i128x2, i64x2, u128x2), + (u128x4, i128x4, i64x4, u128x4) + + // (usizex2, isizex2, i64x2, u64x2), + // (usizex4, isizex4, i64x4, u64x4), + // (usizex8, isizex8, i64x8, u64x8) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distributions::uniform::tests::test_samples; + use crate::distributions::Uniform; + + #[test] + #[cfg(feature = "serde1")] + fn test_uniform_serialization() { + let unit_box: Uniform = Uniform::new(-1, 1); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + + assert_eq!(unit_box.0.low, de_unit_box.0.low); + assert_eq!(unit_box.0.range, de_unit_box.0.range); + assert_eq!(unit_box.0.z, de_unit_box.0.z); + } + + #[should_panic] + #[test] + fn test_uniform_bad_limits_equal_int() { + Uniform::new(10, 10); + } + + #[test] + fn test_uniform_good_limits_equal_int() { + let mut rng = crate::test::rng(804); + let dist = Uniform::new_inclusive(10, 10); + for _ in 0..20 { + assert_eq!(rng.sample(dist), 10); + } + } + + #[should_panic] + #[test] + fn test_uniform_bad_limits_flipped_int() { + Uniform::new(10, 5); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_integers() { + use core::{i128, u128}; + use core::{i16, i32, i64, i8, isize}; + use core::{u16, u32, u64, u8, usize}; + + let mut rng = crate::test::rng(251); + macro_rules! t { + ($ty:ident, $v:expr, $le:expr, $lt:expr) => {{ + for &(low, high) in $v.iter() { + let my_uniform = Uniform::new(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } + + let my_uniform = Uniform::new_inclusive(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } + + let my_uniform = Uniform::new(&low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } + + let my_uniform = Uniform::new_inclusive(&low, &high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } + + for _ in 0..1000 { + let v = <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng); + assert!($le(low, v) && $lt(v, high)); + } + + for _ in 0..1000 { + let v = <$ty as SampleUniform>::Sampler::sample_single_inclusive(low, high, &mut rng); + assert!($le(low, v) && $le(v, high)); + } + } + }}; + + // scalar bulk + ($($ty:ident),*) => {{ + $(t!( + $ty, + [(0, 10), (10, 127), ($ty::MIN, $ty::MAX)], + |x, y| x <= y, + |x, y| x < y + );)* + }}; + + // simd bulk + ($($ty:ident),* => $scalar:ident) => {{ + $(t!( + $ty, + [ + ($ty::splat(0), $ty::splat(10)), + ($ty::splat(10), $ty::splat(127)), + ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), + ], + |x: $ty, y| x.le(y).all(), + |x: $ty, y| x.lt(y).all() + );)* + }}; + } + t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, i128, u128); + + #[cfg(feature = "simd_support")] + { + t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); + t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); + t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); + t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); + t!(u32x2, u32x4, u32x8, u32x16 => 
u32); + t!(i32x2, i32x4, i32x8, i32x16 => i32); + t!(u64x2, u64x4, u64x8 => u64); + t!(i64x2, i64x4, i64x8 => i64); + } + } + + #[test] + fn test_uniform_from_std_range() { + let r = Uniform::from(2u32..7); + assert_eq!(r.0.low, 2); + assert_eq!(r.0.range, 5); + } + + #[test] + fn test_uniform_from_std_range_inclusive() { + let r = Uniform::from(2u32..=6); + assert_eq!(r.0.low, 2); + assert_eq!(r.0.range, 5); + } + + #[test] + fn value_stability() { + // We test on a sub-set of types; possibly we should do more. + // TODO: SIMD types + + test_samples(11u8, 219, &[17, 66, 214], &[181, 93, 165]); + test_samples(11u32, 219, &[17, 66, 214], &[181, 93, 165]); + + test_samples(0f32, 1e-2f32, &[0.0003070104, 0.0026630748, 0.00979833], &[ + 0.008194133, + 0.00398172, + 0.007428536, + ]); + test_samples( + -1e10f64, + 1e10f64, + &[-4673848682.871551, 6388267422.932352, 4857075081.198343], + &[1173375212.1808167, 1917642852.109581, 2365076174.3153973], + ); + } + + #[test] + fn uniform_distributions_can_be_compared() { + assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); + } +} From 1a56b3e1c6ec4857b66bcbeb0c629b451cb0e60a Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 17:21:01 +0100 Subject: [PATCH 10/17] Add algorithm impls for repeated sampling --- benches/uniform.rs | 28 +++++- src/distributions/uniform.rs | 3 +- src/distributions/uniform/uniform_int.rs | 119 ++++++++++++++++++++--- 3 files changed, 134 insertions(+), 16 deletions(-) diff --git a/benches/uniform.rs b/benches/uniform.rs index e5b02a2912..57d62dd7c9 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -9,11 +9,27 @@ use criterion::{criterion_group, criterion_main}; use criterion::{BenchmarkId, Criterion}; #[cfg(feature = "simd_support")] use packed_simd::*; -use rand::distributions::uniform::{SampleUniform, UniformSampler}; +use rand::distributions::uniform::{SampleUniform, Uniform, UniformSampler}; use rand::prelude::*; type BenchRng = SmallRng; +macro_rules! bench_dist_int_group { + ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { + for input in $inputs { + $g.bench_with_input( + BenchmarkId::new($name, input.0), + &input.1, + |b, (low, high)| { + let mut rng = BenchRng::from_entropy(); + let dist = Uniform::new_inclusive(low, high); + b.iter(|| <$T as SampleUniform>::Sampler::$f(&dist.0, &mut rng)) + }, + ); + } + }; +} + macro_rules! bench_int_group { ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { for input in $inputs { @@ -39,9 +55,17 @@ macro_rules! bench_int_group { macro_rules! 
bench_int { ($c:expr, $T:ty, $high:expr) => {{ - let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T))); let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))]; + let mut g = $c.benchmark_group(concat!("uniform_dist_int_", stringify!($T))); + bench_dist_int_group!("Old", $T, sample, g, inputs); + bench_dist_int_group!("Lemire", $T, sample_lemire, g, inputs); + bench_dist_int_group!("Canon", $T, sample_canon, g, inputs); + bench_dist_int_group!("Canon-Lemire", $T, sample_canon_lemire, g, inputs); + bench_dist_int_group!("Bitmask", $T, sample_bitmask, g, inputs); + drop(g); + + let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T))); bench_int_group!("Old", $T, sample_single_inclusive, g, inputs); bench_int_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs); bench_int_group!("Canon", $T, sample_single_inclusive_canon, g, inputs); diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 4b81ab9ee9..8b1beb80dc 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -178,7 +178,8 @@ pub use uniform_other::{UniformChar, UniformDuration}; feature = "serde1", serde(bound(deserialize = "X::Sampler: Deserialize<'de>")) )] -pub struct Uniform(X::Sampler); +// HACK: field is pub +pub struct Uniform(pub X::Sampler); impl Uniform { /// Create a new `Uniform` instance which samples uniformly from the half diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs index 8a7dac5b71..8eb7ac98b9 100644 --- a/src/distributions/uniform/uniform_int.rs +++ b/src/distributions/uniform/uniform_int.rs @@ -53,7 +53,8 @@ use crate::Rng; pub struct UniformInt { low: X, range: X, - z: X, // either ints_to_reject or zone depending on implementation + nrmr: X, // range.wrapping_neg() % range + z: X, // either ints_to_reject or zone depending on implementation } macro_rules! uniform_int_impl { @@ -110,6 +111,7 @@ macro_rules! uniform_int_impl { low, // These are really $unsigned values, but store as $ty: range: range as $ty, + nrmr: (range.wrapping_neg() % range) as $ty, z: ints_to_reject as $unsigned as $ty, } } @@ -117,19 +119,18 @@ macro_rules! uniform_int_impl { #[inline] fn sample(&self, rng: &mut R) -> Self::X { let range = self.range as $unsigned as $u_large; - if range > 0 { - let unsigned_max = ::core::$u_large::MAX; - let zone = unsigned_max - (self.z as $unsigned as $u_large); - loop { - let v: $u_large = rng.gen(); - let (hi, lo) = v.wmul(range); - if lo <= zone { - return self.low.wrapping_add(hi as $ty); - } + if range == 0 { + return rng.gen(); + } + + let unsigned_max = ::core::$u_large::MAX; + let zone = unsigned_max - (self.z as $unsigned as $u_large); + loop { + let v: $u_large = rng.gen(); + let (hi, lo) = v.wmul(range); + if lo <= zone { + return self.low.wrapping_add(hi as $ty); } - } else { - // Sample from the entire integer range. - rng.gen() } } @@ -189,6 +190,97 @@ macro_rules! 
uniform_int_impl { } impl UniformInt<$ty> { + /// Sample, Lemire's method + #[inline] + pub fn sample_lemire(&self, rng: &mut R) -> $ty { + let range = self.range as $unsigned as $u_large; + if range == 0 { + return rng.gen(); + } + + let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); + if lo < range { + while lo < (self.nrmr as $u_large) { + let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); + hi = new_hi; + lo = new_lo; + } + } + self.low.wrapping_add(hi as $ty) + } + + /// Sample, Canon's method + #[inline] + pub fn sample_canon(&self, rng: &mut R) -> $ty { + let range = self.range as $unsigned as $u_extra_large; + if range == 0 { + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... + if lo_order > range.wrapping_neg() { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + self.low.wrapping_add(result as $ty) + } + + /// Sample, Canon's method with Lemire's early-out + #[inline] + pub fn sample_canon_lemire(&self, rng: &mut R) -> $ty { + let range = self.range as $unsigned as $u_extra_large; + if range == 0 { + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... + if lo_order < (self.nrmr as $u_extra_large) { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + self.low.wrapping_add(result as $ty) + } + + /// Sample, Bitmask method + #[inline] + pub fn sample_bitmask(&self, rng: &mut R) -> $ty { + let mut range = self.range as $unsigned as $u_large; + if range == 0 { + return rng.gen(); + } + + // the old impl use a mix of methods for different integer sizes, we only use + // the lz method here for a better comparison. + + let mut mask = $u_large::max_value(); + range -= 1; + mask >>= (range | 1).leading_zeros(); + loop { + let x = rng.gen::<$u_large>() & mask; + if x <= range { + return self.low.wrapping_add(x as $ty); + } + } + } + /// Sample single inclusive, using ONeill's method #[inline] pub fn sample_single_inclusive_oneill( @@ -430,6 +522,7 @@ macro_rules! uniform_simd_int_impl { low, // These are really $unsigned values, but store as $ty: range: range.cast(), + nrmr: ((0 - range) % range).cast(), z: zone.cast(), } } From faa141cb53e0c89529dc3ef36abaa058917a8abd Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 21 Oct 2021 10:38:05 +0100 Subject: [PATCH 11/17] Extra SIMD i8 benches --- benches/uniform.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benches/uniform.rs b/benches/uniform.rs index 57d62dd7c9..393c691377 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -133,8 +133,12 @@ macro_rules! 
bench_simd { fn uniform_simd(c: &mut Criterion) { // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size // the exact values were determined with an exhaustive search + bench_simd!(c, i8x2, (i8::MIN, 116)); + bench_simd!(c, i8x4, (i8::MIN, 116)); + bench_simd!(c, i8x8, (i8::MIN, 116)); bench_simd!(c, i8x16, (i8::MIN, 116)); bench_simd!(c, i8x32, (i8::MIN, 116)); + bench_simd!(c, i8x64, (i8::MIN, 116)); bench_simd!(c, i16x8, (i16::MIN, 32407)); bench_simd!(c, i16x16, (i16::MIN, 32407)); bench_simd!(c, i32x4, (i32::MIN, 1)); From b7f9c49d4d4129241cde3789dfefd2924edd5a3a Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Fri, 22 Oct 2021 11:33:58 +0100 Subject: [PATCH 12/17] Add UniformInt::sample_canon_64 (15% faster) --- benches/uniform.rs | 4 ++++ src/distributions/uniform/uniform_int.rs | 25 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/benches/uniform.rs b/benches/uniform.rs index 393c691377..fa2350b4f4 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -82,6 +82,10 @@ fn uniform_int(c: &mut Criterion) { bench_int!(c, i32, (i32::MIN, 1)); bench_int!(c, i64, (i64::MIN, 1)); bench_int!(c, i128, (i128::MIN, 1)); + + let inputs = &[("high_reject", (i128::MIN, 1)), ("low_reject", (-1, 2))]; + let mut g = c.benchmark_group(concat!("uniform_dist_int_", stringify!(i128))); + bench_dist_int_group!("Canon64", i128, sample_canon_64, g, inputs); } #[cfg(feature = "simd_support")] diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs index 8eb7ac98b9..b9381986fc 100644 --- a/src/distributions/uniform/uniform_int.rs +++ b/src/distributions/uniform/uniform_int.rs @@ -462,6 +462,31 @@ mod isize_int_impls { uniform_int_impl! { usize, usize, usize, usize } } +impl UniformInt { + /// Sample, Canon's method variant + #[inline] + pub fn sample_canon_64(&self, rng: &mut R) -> i128 { + let range = self.range as i128 as u128; + if range == 0 { + return rng.gen(); + } + + // Sample multiplied by 2.pow(-128) makes lo1 fractional (>>128): + let (mut result, lo1) = rng.gen::().wmul(range); + + if lo1 > range.wrapping_neg() { + // Generate more bits. Sample is multiplied by 2.pow(-192), so + // hi2 is multiplied by 2.pow(-64): + let (hi2, lo2) = (rng.gen::() as u128).wmul(range); + debug_assert_eq!(hi2 >> 64, 0u128); + let is_overflow = lo1.checked_add((hi2 << 64) | (lo2 >> 64)).is_none(); + result += is_overflow as u128; + } + + self.low.wrapping_add(result as i128) + } +} + #[cfg(feature = "simd_support")] macro_rules! uniform_simd_int_impl { ($ty:ident, $unsigned:ident, $u_scalar:ident) => { From 5326db3fc5182a439c3a486f9ae9d1f69f310789 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Sun, 24 Oct 2021 08:47:56 +0100 Subject: [PATCH 13/17] Additional Canon64 variants --- benches/uniform.rs | 5 +--- src/distributions/uniform/uniform_int.rs | 37 +++++++++++++++++++++++- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/benches/uniform.rs b/benches/uniform.rs index fa2350b4f4..aa92035e79 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -61,6 +61,7 @@ macro_rules! 
bench_int {
         bench_dist_int_group!("Old", $T, sample, g, inputs);
         bench_dist_int_group!("Lemire", $T, sample_lemire, g, inputs);
         bench_dist_int_group!("Canon", $T, sample_canon, g, inputs);
+        bench_dist_int_group!("Canon64", $T, sample_canon_64, g, inputs);
         bench_dist_int_group!("Canon-Lemire", $T, sample_canon_lemire, g, inputs);
         bench_dist_int_group!("Bitmask", $T, sample_bitmask, g, inputs);
         drop(g);
@@ -82,10 +83,6 @@ fn uniform_int(c: &mut Criterion) {
     bench_int!(c, i32, (i32::MIN, 1));
     bench_int!(c, i64, (i64::MIN, 1));
     bench_int!(c, i128, (i128::MIN, 1));
-
-    let inputs = &[("high_reject", (i128::MIN, 1)), ("low_reject", (-1, 2))];
-    let mut g = c.benchmark_group(concat!("uniform_dist_int_", stringify!(i128)));
-    bench_dist_int_group!("Canon64", i128, sample_canon_64, g, inputs);
 }
 
 #[cfg(feature = "simd_support")]
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index b9381986fc..9c65c582d0 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -462,11 +462,46 @@ mod isize_int_impls {
     uniform_int_impl! { usize, usize, usize, usize }
 }
 
+macro_rules! uniform_int_64_impl {
+    ($ty:ty, $unsigned:ident) => {
+        impl UniformInt<$ty> {
+            /// Sample, Canon's method variant
+            #[inline]
+            pub fn sample_canon_64<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let range = self.range as $unsigned as u64;
+                if range == 0 {
+                    return rng.gen();
+                }
+
+                let (result, _lo1) = rng.gen::<u64>().wmul(range);
+                // bias is at most 1 in 2.pow(56) for i8, 1 in 2.pow(48) for i16
+                self.low.wrapping_add(result as $ty)
+            }
+        }
+    }
+}
+uniform_int_64_impl!(i8, u8);
+uniform_int_64_impl!(i16, u16);
+
+macro_rules! uniform_int_64void_impl {
+    ($ty:ty) => {
+        impl UniformInt<$ty> {
+            /// Sample, Canon's method variant
+            #[inline]
+            pub fn sample_canon_64<R: Rng + ?Sized>(&self, _rng: &mut R) -> $ty {
+                Default::default() // not used
+            }
+        }
+    }
+}
+uniform_int_64void_impl!(i32);
+uniform_int_64void_impl!(i64);
+
 impl UniformInt<i128> {
     /// Sample, Canon's method variant
     #[inline]
     pub fn sample_canon_64<R: Rng + ?Sized>(&self, rng: &mut R) -> i128 {
-        let range = self.range as i128 as u128;
+        let range = self.range as u128;
         if range == 0 {
             return rng.gen();
         }

From a97199b0a291cf5f42561e3040560e173eb07b9b Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Mon, 15 Nov 2021 15:08:59 +0000
Subject: [PATCH 14/17] Add sample_single_inclusive_canon_64

This beats other uniform_int_i128 results
---
 benches/uniform.rs                       |  1 +
 src/distributions/uniform/uniform_int.rs | 81 ++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/benches/uniform.rs b/benches/uniform.rs
index aa92035e79..cb743b4f78 100644
--- a/benches/uniform.rs
+++ b/benches/uniform.rs
@@ -70,6 +70,7 @@ macro_rules! bench_int {
         bench_int_group!("Old", $T, sample_single_inclusive, g, inputs);
         bench_int_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs);
         bench_int_group!("Canon", $T, sample_single_inclusive_canon, g, inputs);
+        bench_int_group!("Canon64", $T, sample_single_inclusive_canon_64, g, inputs);
         bench_int_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs);
         bench_int_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs);
     }};
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index 9c65c582d0..b1bd86927b 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -466,6 +468,8 @@ macro_rules! uniform_int_64_impl {
     ($ty:ty, $unsigned:ident) => {
         impl UniformInt<$ty> {
             /// Sample, Canon's method variant
+            ///
+            /// Variant: potential increase to bias (uses a single `u64` sample).
             #[inline]
             pub fn sample_canon_64<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
                 let range = self.range as $unsigned as u64;
@@ -477,6 +479,35 @@ macro_rules! uniform_int_64_impl {
                 // bias is at most 1 in 2.pow(56) for i8, 1 in 2.pow(48) for i16
                 self.low.wrapping_add(result as $ty)
             }
+
+            /// Sample single inclusive, using Canon's method variant
+            ///
+            /// Variant: potential increase to bias (uses a single `u64` sample).
+            #[inline]
+            pub fn sample_single_inclusive_canon_64<R: Rng + ?Sized, B1, B2>(
+                low_b: B1, high_b: B2, rng: &mut R,
+            ) -> $ty
+            where
+                B1: SampleBorrow<$ty> + Sized,
+                B2: SampleBorrow<$ty> + Sized,
+            {
+                let low = *low_b.borrow();
+                let high = *high_b.borrow();
+                assert!(
+                    low <= high,
+                    "UniformSampler::sample_single_inclusive: low > high"
+                );
+                let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as u64;
+                if range == 0 {
+                    // Range is MAX+1 (unrepresentable), so we need a special case
+                    return rng.gen();
+                }
+
+                // generate a sample using a sensible integer type
+                let (result, _lo1) = rng.gen::<u64>().wmul(range);
+                // bias is at most 1 in 2.pow(56) for i8, 1 in 2.pow(48) for i16
+                low.wrapping_add(result as $ty)
+            }
         }
     }
 }
@@ -491,6 +522,18 @@ macro_rules! uniform_int_64void_impl {
             pub fn sample_canon_64<R: Rng + ?Sized>(&self, _rng: &mut R) -> $ty {
                 Default::default() // not used
             }
+
+            /// Sample single inclusive, using Canon's method variant
+            #[inline]
+            pub fn sample_single_inclusive_canon_64<R: Rng + ?Sized, B1, B2>(
+                _low_b: B1, _high_b: B2, _rng: &mut R,
+            ) -> $ty
+            where
+                B1: SampleBorrow<$ty> + Sized,
+                B2: SampleBorrow<$ty> + Sized,
+            {
+                Default::default() // not used
+            }
         }
     }
 }
@@ -520,6 +563,44 @@ impl UniformInt<i128> {
         self.low.wrapping_add(result as i128)
     }
+
+    /// Sample single inclusive, using Canon's method variant
+    ///
+    /// Variant: potential increase to bias (uses a single `u64` sample).
+    #[inline]
+    pub fn sample_single_inclusive_canon_64<R: Rng + ?Sized, B1, B2>(
+        low_b: B1, high_b: B2, rng: &mut R,
+    ) -> i128
+    where
+        B1: SampleBorrow<i128> + Sized,
+        B2: SampleBorrow<i128> + Sized,
+    {
+        let low = *low_b.borrow();
+        let high = *high_b.borrow();
+        assert!(
+            low <= high,
+            "UniformSampler::sample_single_inclusive: low > high"
+        );
+        let range = high.wrapping_sub(low).wrapping_add(1) as u128;
+        if range == 0 {
+            // Range is MAX+1 (unrepresentable), so we need a special case
+            return rng.gen();
+        }
+
+        // generate a sample using a sensible integer type
+        let (mut result, lo1) = rng.gen::<u128>().wmul(range);
+
+        if lo1 > range.wrapping_neg() {
+            // Generate more bits. Sample is multiplied by 2.pow(-192), so
+            // hi2 is multiplied by 2.pow(-64):
+            let (hi2, lo2) = (rng.gen::<u64>() as u128).wmul(range);
+            debug_assert_eq!(hi2 >> 64, 0u128);
+            let is_overflow = lo1.checked_add((hi2 << 64) | (lo2 >> 64)).is_none();
+            result += is_overflow as u128;
+        }
+
+        low.wrapping_add(result as i128)
+    }
 }
 
 #[cfg(feature = "simd_support")]

From 9d0c88a2979974bd3088368660fa0f832ac5d037 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Mon, 15 Nov 2021 15:16:30 +0000
Subject: [PATCH 15/17] dedup SIMD sample_single_inclusive_bitmask

---
 src/distributions/uniform/uniform_int.rs | 135 ++++++++--------------
 1 file changed, 45 insertions(+), 90 deletions(-)

diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index b1bd86927b..da1a5b3e93 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -715,6 +715,51 @@ macro_rules! uniform_simd_int_impl {
                 let range: $unsigned = ((high - low) + 1).cast();
                 (range, low)
             }
+
+            ///
+            #[inline(always)]
+            pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
+                low_b: B1, high_b: B2, rng: &mut R,
+            ) -> $ty
+            where
+                B1: SampleBorrow<$ty> + Sized,
+                B2: SampleBorrow<$ty> + Sized,
+            {
+                let (mut range, low) = Self::sample_inc_setup(low_b, high_b);
+                let is_full_range = range.eq($unsigned::splat(0));
+
+                // generate bitmask
+                range -= 1;
+                let mut mask = range | 1;
+
+                mask |= mask >> 1;
+                mask |= mask >> 2;
+                mask |= mask >> 4;
+
+                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
+                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
+                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
+                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
+                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
+
+                let mut v: $unsigned = rng.gen();
+                loop {
+                    let masked = v & mask;
+                    let accept = masked.le(range);
+                    if accept.all() {
+                        let masked: $ty = masked.cast();
+                        // wrapping addition
+                        let result = low + masked;
+                        // `select` here compiles to a blend operation
+                        // When `range.eq(0).none()` the compare and blend
+                        // operations are avoided.
+                        let v: $ty = v.cast();
+                        return is_full_range.select(v, result);
+                    }
+                    // Replace only the failing lanes
+                    v = accept.select(v, rng.gen());
+                }
+            }
         }
     };
 
@@ -806,51 +851,6 @@ macro_rules! uniform_simd_int_gt8_impl {
                 let cast_rand_bits: $ty = rand_bits.cast();
                 is_full_range.select(cast_rand_bits, low + cast_result)
             }
-
-            /// Bitmask
-            #[inline(always)]
-            pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
-                low_b: B1, high_b: B2, rng: &mut R,
-            ) -> $ty
-            where
-                B1: SampleBorrow<$ty> + Sized,
-                B2: SampleBorrow<$ty> + Sized,
-            {
-                let (mut range, low) = Self::sample_inc_setup(low_b, high_b);
-                let is_full_range = range.eq($unsigned::splat(0));
-
-                // generate bitmask
-                range -= 1;
-                let mut mask = range | 1;
-
-                mask |= mask >> 1;
-                mask |= mask >> 2;
-                mask |= mask >> 4;
-
-                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
-                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
-                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
-                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
-                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
-
-                let mut v: $unsigned = rng.gen();
-                loop {
-                    let masked = v & mask;
-                    let accept = masked.le(range);
-                    if accept.all() {
-                        let masked: $ty = masked.cast();
-                        // wrapping addition
-                        let result = low + masked;
-                        // `select` here compiles to a blend operation
-                        // When `range.eq(0).none()` the compare and blend
-                        // operations are avoided.
-                        let v: $ty = v.cast();
-                        return is_full_range.select(v, result);
-                    }
-                    // Replace only the failing lanes
-                    v = accept.select(v, rng.gen());
-                }
-            }
         }
     };
 
@@ -966,51 +966,6 @@ macro_rules! uniform_simd_int_le8_impl {
                 let cast_rand_bits: $ty = rand_bits.cast();
                 is_full_range.select(cast_rand_bits, low + cast_result)
             }
-
-            ///
-            #[inline(always)]
-            pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
-                low_b: B1, high_b: B2, rng: &mut R,
-            ) -> $ty
-            where
-                B1: SampleBorrow<$ty> + Sized,
-                B2: SampleBorrow<$ty> + Sized,
-            {
-                let (mut range, low) = Self::sample_inc_setup(low_b, high_b);
-                let is_full_range = range.eq($unsigned::splat(0));
-
-                // generate bitmask
-                range -= 1;
-                let mut mask = range | 1;
-
-                mask |= mask >> 1;
-                mask |= mask >> 2;
-                mask |= mask >> 4;
-
-                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
-                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
-                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
-                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
-                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
-
-                let mut v: $unsigned = rng.gen();
-                loop {
-                    let masked = v & mask;
-                    let accept = masked.le(range);
-                    if accept.all() {
-                        let masked: $ty = masked.cast();
-                        // wrapping addition
-                        let result = low + masked;
-                        // `select` here compiles to a blend operation
-                        // When `range.eq(0).none()` the compare and blend
-                        // operations are avoided.
-                        let v: $ty = v.cast();
-                        return is_full_range.select(v, result);
-                    }
-                    // Replace only the failing lanes
-                    v = accept.select(v, rng.gen());
-                }
-            }
         }
     };

From b912a09c06eb6f1e4befc1a1fc0d30c92e4d2072 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Tue, 16 Nov 2021 09:30:25 +0000
Subject: [PATCH 16/17] temp

---
 benches/uniform.rs                       |  45 +++--
 src/distributions/uniform/uniform_int.rs | 236 ++++++++++++++---------
 2 files changed, 177 insertions(+), 104 deletions(-)

diff --git a/benches/uniform.rs b/benches/uniform.rs
index cb743b4f78..cdd8376482 100644
--- a/benches/uniform.rs
+++ b/benches/uniform.rs
@@ -86,6 +86,24 @@ fn uniform_int(c: &mut Criterion) {
     bench_int!(c, i128, (i128::MIN, 1));
 }
 
+#[cfg(feature = "simd_support")]
+macro_rules! bench_dist_simd_group {
+    ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
+        for input in $inputs {
+            $g.bench_with_input(
+                BenchmarkId::new($name, input.0),
+                &input.1,
+                |b, (low, high)| {
+                    let mut rng = BenchRng::from_entropy();
+                    let (low, high) = (<$T>::splat(*low), <$T>::splat(*high));
+                    let dist = Uniform::new_inclusive(low, high);
+                    b.iter(|| <$T as SampleUniform>::Sampler::$f(&dist.0, &mut rng))
+                },
+            );
+        }
+    };
+}
+
 #[cfg(feature = "simd_support")]
 macro_rules! bench_simd_group {
     ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
@@ -114,20 +132,21 @@ macro_rules! bench_simd_group {
 
 #[cfg(feature = "simd_support")]
 macro_rules! bench_simd {
     ($c:expr, $T:ty, $high:expr/*, $incr:expr*/) => {{
-        let mut g = $c.benchmark_group(concat!("uniform_simd_", stringify!($T)));
         let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))];
 
+        let mut g = $c.benchmark_group(concat!("uniform_dist_simd_", stringify!($T)));
+        bench_dist_simd_group!("Old", $T, sample, g, inputs);
+        bench_dist_simd_group!("Canon", $T, sample_canon, g, inputs);
+        bench_dist_simd_group!("Canon-Lemire", $T, sample_canon_lemire, g, inputs);
+        bench_dist_simd_group!("Bitmask", $T, sample_bitmask, g, inputs);
+        drop(g);
+
+        let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T)));
         bench_simd_group!("Old", $T, sample_single_inclusive, g, inputs);
         bench_simd_group!("Canon", $T, sample_single_inclusive_canon, g, inputs);
-        bench_simd_group!(
-            "Canon-branchless",
-            $T,
-            sample_single_inclusive_canon_branchless,
-            g,
-            inputs
-        );
-        bench_simd_group!("Canon-scalar", $T, sample_inclusive_canon_scalar, g, inputs);
+        bench_simd_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs);
         bench_simd_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs);
+
     }};
 }
@@ -138,11 +157,11 @@ fn uniform_simd(c: &mut Criterion) {
     bench_simd!(c, i8x2, (i8::MIN, 116));
     bench_simd!(c, i8x4, (i8::MIN, 116));
     bench_simd!(c, i8x8, (i8::MIN, 116));
-    bench_simd!(c, i8x16, (i8::MIN, 116));
-    bench_simd!(c, i8x32, (i8::MIN, 116));
-    bench_simd!(c, i8x64, (i8::MIN, 116));
+// bench_simd!(c, i8x16, (i8::MIN, 116));
+// bench_simd!(c, i8x32, (i8::MIN, 116));
+// bench_simd!(c, i8x64, (i8::MIN, 116));
     bench_simd!(c, i16x8, (i16::MIN, 32407));
-    bench_simd!(c, i16x16, (i16::MIN, 32407));
+// bench_simd!(c, i16x16, (i16::MIN, 32407));
     bench_simd!(c, i32x4, (i32::MIN, 1));
     bench_simd!(c, i32x8, (i32::MIN, 1));
     bench_simd!(c, i64x2, (i64::MIN, 1));
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index da1a5b3e93..24c84f38d9 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -702,6 +702,48 @@ macro_rules! uniform_simd_int_impl {
         }
 
         impl UniformInt<$ty> {
+            /// Sample, Bitmask method
+            #[inline]
+            pub fn sample_bitmask<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let mut range: $unsigned = self.range.cast();
+                let is_full_range = range.eq($unsigned::splat(0));
+
+                // the old impl uses a mix of methods for different integer sizes, we only use
+                // the lz method here for a better comparison.
+
+                // generate bitmask
+                range -= 1;
+                let mut mask = range | 1;
+
+                mask |= mask >> 1;
+                mask |= mask >> 2;
+                mask |= mask >> 4;
+
+                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
+                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
+                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
+                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
+                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
+
+                let mut v: $unsigned = rng.gen();
+                loop {
+                    let masked = v & mask;
+                    let accept = masked.le(range);
+                    if accept.all() {
+                        let masked: $ty = masked.cast();
+                        // wrapping addition
+                        let result = self.low + masked;
+                        // `select` here compiles to a blend operation
+                        // When `range.eq(0).none()` the compare and blend
+                        // operations are avoided.
+                        let v: $ty = v.cast();
+                        return is_full_range.select(v, result);
+                    }
+                    // Replace only the failing lanes
+                    v = accept.select(v, rng.gen());
+                }
+            }
+
             #[inline(always)]
             fn sample_inc_setup<B1, B2>(low_b: B1, high_b: B2) -> ($unsigned, $ty)
             where
@@ -716,8 +758,8 @@ macro_rules! uniform_simd_int_impl {
                 (range, low)
             }
 
-            ///
-            #[inline(always)]
+            /// Sample single inclusive, using the Bitmask method
+            #[inline]
             pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
                 low_b: B1, high_b: B2, rng: &mut R,
             ) -> $ty
@@ -771,7 +813,7 @@ macro_rules! uniform_simd_int_impl {
         )+
     };
 }
-
+/*
 #[cfg(feature = "simd_support")]
 macro_rules! uniform_simd_int_gt8_impl {
     ($ty:ident, $unsigned:ident) => {
@@ -792,7 +834,7 @@ macro_rules! uniform_simd_int_gt8_impl {
                 let (_, overflowed) = lo_order.overflowing_add(cast_new_hi);
                 *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0));
             }
-
+/*
             /// Canon's method
             #[inline(always)]
             pub fn sample_single_inclusive_canon_branchless<R: Rng + ?Sized, B1, B2>(
@@ -851,6 +893,7 @@ macro_rules! uniform_simd_int_gt8_impl {
                 let cast_rand_bits: $ty = rand_bits.cast();
                 is_full_range.select(cast_rand_bits, low + cast_result)
             }
+*/
         }
     };
 
@@ -859,54 +902,77 @@ macro_rules! uniform_simd_int_gt8_impl {
         uniform_simd_int_gt8_impl!{ $signed, $unsigned }
     )+};
 }
+*/
+// These are the naive ports of the above algorithms to SIMD.
+// Caveat: supports only up to x8, and at x8 it relies on 512-bit SIMD.
+// Caveat: always generates u64 samples which will often be missing.
 
 #[cfg(feature = "simd_support")]
 macro_rules! uniform_simd_int_le8_impl {
-    ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => {
+    ($ty:ident, $unsigned:ident, $u_extra_large:ident) => {
         impl UniformInt<$ty> {
             #[inline(always)]
             fn canon_successive<R: Rng + ?Sized>(
-                range: $unsigned,
-                result: &mut $unsigned,
-                lo_order: $unsigned,
+                range: $u_extra_large,
+                result: &mut $u_extra_large,
+                lo_order: $u_extra_large,
                 rng: &mut R
             ) {
                 // ...generate a new sample with 64 more bits, enough that bias is undetectable
-                let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast();
-                let large_range: $u_extra_large = range.cast();
-                let (new_hi_order, _) = new_bits.wmul(large_range);
+                let new_bits: $u_extra_large = rng.gen::<$u_extra_large>();
+                let (new_hi_order, _) = new_bits.wmul(range);
                 // and adjust if needed
-                let cast_new_hi: $unsigned = new_hi_order.cast();
-                let (_, overflowed) = lo_order.overflowing_add(cast_new_hi);
-                *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0));
+                let (_, overflowed) = lo_order.overflowing_add(new_hi_order);
+                *result += overflowed.select($u_extra_large::splat(1), $u_extra_large::splat(0));
             }
 
-            ///
-            #[inline(always)]
-            pub fn sample_single_inclusive_canon_branchless<R: Rng + ?Sized, B1, B2>(
-                low_b: B1, high_b: B2, rng: &mut R,
-            ) -> $ty
-            where
-                B1: SampleBorrow<$ty> + Sized,
-                B2: SampleBorrow<$ty> + Sized,
-            {
-                let (range, low) = Self::sample_inc_setup(low_b, high_b);
+            /// Sample, Canon's method
+            #[inline]
+            pub fn sample_canon<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let range: $unsigned = self.range.cast();
                 let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
 
                 // generate a sample using a sensible integer type
-                let rand_bits = rng.gen::<$unsigned>();
+                let rand_bits = rng.gen::<$u_extra_large>();
                 let (mut result, lo_order) = rand_bits.wmul(range);
 
-                Self::canon_successive(range, &mut result, lo_order, rng);
+                if lo_order.gt(0 - range).any() {
+                    Self::canon_successive(range, &mut result, lo_order, rng);
+                }
 
-                let cast_result: $ty = result.cast();
-                let cast_rand_bits: $ty = rand_bits.cast();
-                is_full_range.select(cast_rand_bits, low + cast_result)
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, self.low + result)
             }
 
-            ///
-            #[inline(always)]
-            pub fn sample_inclusive_canon_scalar<R: Rng + ?Sized, B1, B2>(
+            /// Sample, Canon's method with Lemire's early-out
+            #[inline]
+            pub fn sample_canon_lemire<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let range: $unsigned = self.range.cast();
+                let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
+
+                // generate a sample using a sensible integer type
+                let rand_bits = rng.gen::<$u_extra_large>();
+                let (mut result, lo_order) = rand_bits.wmul(range);
+
+                let nrmr: $unsigned = self.nrmr.cast();
+                let nrmr: $u_extra_large = nrmr.cast();
+                if lo_order.lt(nrmr).any() {
+                    Self::canon_successive(range, &mut result, lo_order, rng);
+                }
+
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, self.low + result)
+            }
+
+            /// Sample single inclusive, using Canon's method
+            #[inline]
+            pub fn sample_single_inclusive_canon<R: Rng + ?Sized, B1, B2>(
                 low_b: B1, high_b: B2, rng: &mut R,
             ) -> $ty
             where
                 B1: SampleBorrow<$ty> + Sized,
                 B2: SampleBorrow<$ty> + Sized,
             {
                 let (range, low) = Self::sample_inc_setup(low_b, high_b);
                 let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
 
                 // generate a sample using a sensible integer type
-                let rand_bits = rng.gen::<$unsigned>();
+                let rand_bits = rng.gen::<$u_extra_large>();
                 let (mut result, lo_order) = rand_bits.wmul(range);
 
-                // ...generate a new sample with 64 more bits, enough that bias is undetectable
-                let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast();
-                let large_range: $u_extra_large = range.cast();
-
-                // let (new_hi_order, _) = new_bits.wmul(large_range);
-                let mut new_hi_order = <$u_extra_large>::default();
-
-                for i in 0..<$ty>::lanes() {
-                    let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i));
-                    new_hi_order = new_hi_order.replace(i, shi);
+                if lo_order.gt(0 - range).any() {
+                    Self::canon_successive(range, &mut result, lo_order, rng);
                 }
 
-                // and adjust if needed
-                let cast_new_hi: $unsigned = new_hi_order.cast();
-                let (_, overflowed) = lo_order.overflowing_add(cast_new_hi);
-                result += overflowed.select($unsigned::splat(1), $unsigned::splat(0));
-
-                let cast_result: $ty = result.cast();
-                let cast_rand_bits: $ty = rand_bits.cast();
-                is_full_range.select(cast_rand_bits, low + cast_result)
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, low + result)
             }
 
-            ///
-            #[inline(always)]
-            pub fn sample_single_inclusive_canon<R: Rng + ?Sized, B1, B2>(
+            /// Sample single inclusive, using Canon's method with Lemire's early-out
+            #[inline]
+            pub fn sample_inclusive_canon_lemire<R: Rng + ?Sized, B1, B2>(
                 low_b: B1, high_b: B2, rng: &mut R,
             ) -> $ty
             where
                 B1: SampleBorrow<$ty> + Sized,
                 B2: SampleBorrow<$ty> + Sized,
             {
                 let (range, low) = Self::sample_inc_setup(low_b, high_b);
                 let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
 
                 // generate a sample using a sensible integer type
-                let rand_bits = rng.gen::<$unsigned>();
+                let rand_bits = rng.gen::<$u_extra_large>();
                 let (mut result, lo_order) = rand_bits.wmul(range);
 
-                if lo_order.gt(0 - range).any() {
+                let nrmr = ((0 - range) % range);
+                if lo_order.lt(nrmr).any() {
                     Self::canon_successive(range, &mut result, lo_order, rng);
                 }
 
-                let cast_result: $ty = result.cast();
-                let cast_rand_bits: $ty = rand_bits.cast();
-                is_full_range.select(cast_rand_bits, low + cast_result)
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, low + result)
+            }
         }
     };
 
-    ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$(
-        uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large }
-        uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large }
+    ($(($unsigned:ident, $signed:ident, $u_extra_large:ident),)+) => {$(
+        uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u_extra_large }
+        uniform_simd_int_le8_impl!{ $signed, $unsigned, $u_extra_large }
     )+};
 }
@@ -1023,41 +1081,37 @@ uniform_simd_int_impl! {
     u8
 }
 
-#[cfg(feature = "simd_support")]
-uniform_simd_int_gt8_impl! {
-    (u8x16, i8x16),
-    (u8x32, i8x32),
-    (u8x64, i8x64),
-
-    (u16x16, i16x16),
-    (u16x32, i16x32),
-
-    (u32x16, i32x16)
-}
+// #[cfg(feature = "simd_support")]
+// uniform_simd_int_gt8_impl! {
+//     (u8x16, i8x16),
+//     (u8x32, i8x32),
+//     (u8x64, i8x64),
+//
+//     (u16x16, i16x16),
+//     (u16x32, i16x32),
+//
+//     (u32x16, i32x16)
+// }
 
 #[cfg(feature = "simd_support")]
 uniform_simd_int_le8_impl! {
-    (u8x2, i8x2, i64x2, u64x2),
-    (u8x4, i8x4, i64x4, u64x4),
-    (u8x8, i8x8, i64x8, u64x8),
-
-    (u16x2, i16x2, i64x2, u64x2),
-    (u16x4, i16x4, i64x4, u64x4),
-    (u16x8, i16x8, i64x8, u64x8),
+    (u8x2, i8x2, u64x2),
+    (u8x4, i8x4, u64x4),
+    (u8x8, i8x8, u64x8),
 
-    (u32x2, i32x2, i64x2, u64x2),
-    (u32x4, i32x4, i64x4, u64x4),
-    (u32x8, i32x8, i64x8, u64x8),
+    (u16x2, i16x2, u64x2),
+    (u16x4, i16x4, u64x4),
+    (u16x8, i16x8, u64x8),
 
-    (u64x2, i64x2, i64x2, u64x2),
-    (u64x4, i64x4, i64x4, u64x4),
-    (u64x8, i64x8, i64x8, u64x8),
+    (u32x2, i32x2, u64x2),
+    (u32x4, i32x4, u64x4),
+    (u32x8, i32x8, u64x8),
 
-    (u128x2, i128x2, i64x2, u128x2),
-    (u128x4, i128x4, i64x4, u128x4)
+    (u64x2, i64x2, u64x2),
+    (u64x4, i64x4, u64x4),
+    (u64x8, i64x8, u64x8),
 
-    // (usizex2, isizex2, i64x2, u64x2),
-    // (usizex4, isizex4, i64x4, u64x4),
-    // (usizex8, isizex8, i64x8, u64x8)
+    (u128x2, i128x2, u128x2),
+    (u128x4, i128x4, u128x4),
 }
 
 #[cfg(test)]

From a9d9a88e17acae487f6fa74a320a037ab68437eb Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Thu, 24 Feb 2022 10:58:03 +0000
Subject: [PATCH 17/17] temp

---
 benches/uniform.rs           |  3 +-
 benches/uniform_simd.rs      | 69 ++++++++++++++++++++++++++++++++++++
 src/distributions/uniform.rs |  3 +-
 3 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 benches/uniform_simd.rs

diff --git a/benches/uniform.rs b/benches/uniform.rs
index cdd8376482..80520c8df6 100644
--- a/benches/uniform.rs
+++ b/benches/uniform.rs
@@ -10,9 +10,8 @@ use criterion::{criterion_group, criterion_main};
 use criterion::{BenchmarkId, Criterion};
 #[cfg(feature = "simd_support")] use packed_simd::*;
 use rand::distributions::uniform::{SampleUniform, Uniform, UniformSampler};
-use rand::prelude::*;
 
-type BenchRng = SmallRng;
+type BenchRng = rand::rngs::SmallRng;
 
 macro_rules! bench_dist_int_group {
     ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
diff --git a/benches/uniform_simd.rs b/benches/uniform_simd.rs
new file mode 100644
index 0000000000..2cf6ac0bf3
--- /dev/null
+++ b/benches/uniform_simd.rs
@@ -0,0 +1,69 @@
+// Copyright 2021 Developers of the Rand project.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use criterion::{criterion_group, criterion_main};
+use criterion::{BenchmarkGroup, BenchmarkId, Criterion};
+use rand::distributions::uniform::{SampleUniform, UniformSampler};
+use rand::prelude::*;
+use packed_simd::*;
+
+type BenchRng = SmallRng;
+
+macro_rules! bench_simd_group {
+    ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
+        for input in $inputs {
+            $g.bench_with_input(
+                BenchmarkId::new($name, input.0),
+                &input.1,
+                |b, (low, high)| {
+                    let mut rng = BenchRng::from_entropy();
+                    b.iter(|| <$T as SampleUniform>::Sampler::$f(low, high, &mut rng))
+                },
+            );
+        }
+        $g.bench_function(BenchmarkId::new($name, "varying"), |b| {
+            let (low, mut high) = ($inputs[0].1 .0, $inputs[1].1 .1);
+            let mut rng = BenchRng::from_entropy();
+            b.iter(|| {
+                high = high.wrapping_add(1);
+                <$T as SampleUniform>::Sampler::$f(low, high, &mut rng)
+            })
+        });
+    };
+}
+
+macro_rules! bench_simd {
+    ($c:expr, $T:ty, $high:expr) => {{
+        let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T)));
+        let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))];
+
+        bench_simd_group!("Old", $T, sample_single_inclusive, g, inputs);
+// bench_simd_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs);
+        bench_simd_group!("Canon", $T, sample_single_inclusive_canon, g, inputs);
+// bench_simd_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs);
+// bench_simd_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs);
+    }};
+}
+
+fn uniform_int(c: &mut Criterion) {
+    // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size
+    // the exact values were determined with an exhaustive search
+    bench_simd!(c, i8x16, (i8::MIN, 116));
+// bench_simd!(c, i16, (i16::MIN, 32407));
+// bench_simd!(c, i32, (i32::MIN, 1));
+// bench_simd!(c, i64, (i64::MIN, 1));
+// bench_simd!(c, i128, (i128::MIN, 1));
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default();
+    targets = uniform_int
+    // targets = uniform_int_i8x16, uniform_int_i16, uniform_int_i32, uniform_int_i64, uniform_int_i128
+}
+criterion_main!(benches);
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 8b1beb80dc..4b81ab9ee9 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -178,8 +178,7 @@ pub use uniform_other::{UniformChar, UniformDuration};
     feature = "serde1",
     serde(bound(deserialize = "X::Sampler: Deserialize<'de>"))
 )]
-// HACK: field is pub
-pub struct Uniform<X: SampleUniform>(pub X::Sampler);
+pub struct Uniform<X: SampleUniform>(X::Sampler);
 
 impl<X: SampleUniform> Uniform<X> {
     /// Create a new `Uniform` instance which samples uniformly from the half