From 8bc957cc88f224f3693a75d9b2101f64f1176a2b Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 19 Oct 2021 17:36:23 +0100 Subject: [PATCH 01/17] Apply rustfmt to rand::distributions::uniform module --- src/distributions/uniform.rs | 104 ++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 31 deletions(-) diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 261357b245..296e528c37 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -106,8 +106,8 @@ //! [`UniformDuration`]: crate::distributions::uniform::UniformDuration //! [`SampleBorrow::borrow`]: crate::distributions::uniform::SampleBorrow::borrow -use core::time::Duration; use core::ops::{Range, RangeInclusive}; +use core::time::Duration; use crate::distributions::float::IntoFloat; use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; @@ -120,8 +120,7 @@ use crate::distributions::utils::Float; #[cfg(feature = "simd_support")] use packed_simd::*; -#[cfg(feature = "serde1")] -use serde::{Serialize, Deserialize}; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; /// Sample values uniformly between two bounds. /// @@ -175,7 +174,10 @@ use serde::{Serialize, Deserialize}; #[derive(Clone, Copy, Debug, PartialEq)] #[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] #[cfg_attr(feature = "serde1", serde(bound(serialize = "X::Sampler: Serialize")))] -#[cfg_attr(feature = "serde1", serde(bound(deserialize = "X::Sampler: Deserialize<'de>")))] +#[cfg_attr( + feature = "serde1", + serde(bound(deserialize = "X::Sampler: Deserialize<'de>")) +)] pub struct Uniform(X::Sampler); impl Uniform { @@ -291,10 +293,10 @@ pub trait UniformSampler: Sized { /// some types more optimal implementations for single usage may be provided /// via this method. /// Results may not be identical. - fn sample_single_inclusive(low: B1, high: B2, rng: &mut R) - -> Self::X - where B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized + fn sample_single_inclusive(low: B1, high: B2, rng: &mut R) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, { let uniform: Self = UniformSampler::new_inclusive(low, high); uniform.sample(rng) @@ -516,14 +518,19 @@ macro_rules! uniform_int_impl { } #[inline] - fn sample_single_inclusive(low_b: B1, high_b: B2, rng: &mut R) -> Self::X + fn sample_single_inclusive( + low_b: B1, high_b: B2, rng: &mut R, + ) -> Self::X where B1: SampleBorrow + Sized, B2: SampleBorrow + Sized, { let low = *low_b.borrow(); let high = *high_b.borrow(); - assert!(low <= high, "UniformSampler::sample_single_inclusive: low > high"); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; // If the above resulted in wrap-around to 0, the range is $ty::MIN..=$ty::MAX, // and any integer will do. @@ -934,7 +941,10 @@ macro_rules! 
uniform_float_impl { "UniformSampler::sample_single: low >= high" ); let mut scale = high - low; - assert!(scale.all_finite(), "UniformSampler::sample_single: range overflow"); + assert!( + scale.all_finite(), + "UniformSampler::sample_single: range overflow" + ); loop { // Generate a value in the range [1, 2) @@ -1156,24 +1166,43 @@ mod tests { #[cfg(feature = "serde1")] fn test_serialization_uniform_duration() { let distr = UniformDuration::new(Duration::from_secs(10), Duration::from_secs(60)); - let de_distr: UniformDuration = bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); - assert_eq!( - distr.offset, de_distr.offset - ); + let de_distr: UniformDuration = + bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); + assert_eq!(distr.offset, de_distr.offset); match (distr.mode, de_distr.mode) { - (UniformDurationMode::Small {secs: a_secs, nanos: a_nanos}, UniformDurationMode::Small {secs, nanos}) => { + ( + UniformDurationMode::Small { + secs: a_secs, + nanos: a_nanos, + }, + UniformDurationMode::Small { secs, nanos }, + ) => { assert_eq!(a_secs, secs); assert_eq!(a_nanos.0.low, nanos.0.low); assert_eq!(a_nanos.0.range, nanos.0.range); assert_eq!(a_nanos.0.z, nanos.0.z); } - (UniformDurationMode::Medium {nanos: a_nanos} , UniformDurationMode::Medium {nanos}) => { + ( + UniformDurationMode::Medium { nanos: a_nanos }, + UniformDurationMode::Medium { nanos }, + ) => { assert_eq!(a_nanos.0.low, nanos.0.low); assert_eq!(a_nanos.0.range, nanos.0.range); assert_eq!(a_nanos.0.z, nanos.0.z); } - (UniformDurationMode::Large {max_secs:a_max_secs, max_nanos:a_max_nanos, secs:a_secs}, UniformDurationMode::Large {max_secs, max_nanos, secs} ) => { + ( + UniformDurationMode::Large { + max_secs: a_max_secs, + max_nanos: a_max_nanos, + secs: a_secs, + }, + UniformDurationMode::Large { + max_secs, + max_nanos, + secs, + }, + ) => { assert_eq!(a_max_secs, max_secs); assert_eq!(a_max_nanos, max_nanos); @@ -1181,22 +1210,24 @@ mod tests { assert_eq!(a_secs.0.range, secs.0.range); assert_eq!(a_secs.0.z, secs.0.z); } - _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly") + _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly"), } } - + #[test] #[cfg(feature = "serde1")] fn test_uniform_serialization() { - let unit_box: Uniform = Uniform::new(-1, 1); - let de_unit_box: Uniform = bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + let unit_box: Uniform = Uniform::new(-1, 1); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); assert_eq!(unit_box.0.low, de_unit_box.0.low); assert_eq!(unit_box.0.range, de_unit_box.0.range); assert_eq!(unit_box.0.z, de_unit_box.0.z); let unit_box: Uniform = Uniform::new(-1., 1.); - let de_unit_box: Uniform = bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); assert_eq!(unit_box.0.low, de_unit_box.0.low); assert_eq!(unit_box.0.scale, de_unit_box.0.scale); @@ -1361,8 +1392,9 @@ mod tests { assert!(low_scalar <= v && v < high_scalar); let v = rng.sample(my_incl_uniform).extract(lane); assert!(low_scalar <= v && v <= high_scalar); - let v = <$ty as SampleUniform>::Sampler - ::sample_single(low, high, &mut rng).extract(lane); + let v = + <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng) + .extract(lane); assert!(low_scalar <= v && v < high_scalar); } @@ -1373,9 +1405,15 @@ mod tests { 
assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar); assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar); - assert_eq!(<$ty as SampleUniform>::Sampler - ::sample_single(low, high, &mut zero_rng) - .extract(lane), low_scalar); + assert_eq!( + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut zero_rng + ) + .extract(lane), + low_scalar + ); assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar); assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar); @@ -1388,9 +1426,13 @@ mod tests { (-1i64 << $bits_shifted) as u64, ); assert!( - <$ty as SampleUniform>::Sampler - ::sample_single(low, high, &mut lowering_max_rng) - .extract(lane) < high_scalar + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut lowering_max_rng + ) + .extract(lane) + < high_scalar ); } } From 8449015a40e695859bc55a0c20c97c362c0afe14 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 24 Feb 2022 11:22:35 +0000 Subject: [PATCH 02/17] Move UniformChar, UniformDuration to new submodule --- src/distributions/uniform.rs | 358 ++------------------- src/distributions/uniform/uniform_other.rs | 341 ++++++++++++++++++++ 2 files changed, 363 insertions(+), 336 deletions(-) create mode 100644 src/distributions/uniform/uniform_other.rs diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 296e528c37..07a4ddbbf2 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -107,7 +107,6 @@ //! [`SampleBorrow::borrow`]: crate::distributions::uniform::SampleBorrow::borrow use core::ops::{Range, RangeInclusive}; -use core::time::Duration; use crate::distributions::float::IntoFloat; use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; @@ -122,6 +121,10 @@ use crate::distributions::utils::Float; #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; +mod uniform_other; + +pub use uniform_other::{UniformChar, UniformDuration}; + /// Sample values uniformly between two bounds. /// /// [`Uniform::new`] and [`Uniform::new_inclusive`] construct a uniform @@ -721,79 +724,6 @@ uniform_simd_int_impl! { u8 } -impl SampleUniform for char { - type Sampler = UniformChar; -} - -/// The back-end implementing [`UniformSampler`] for `char`. -/// -/// Unless you are implementing [`UniformSampler`] for your own type, this type -/// should not be used directly, use [`Uniform`] instead. -/// -/// This differs from integer range sampling since the range `0xD800..=0xDFFF` -/// are used for surrogate pairs in UCS and UTF-16, and consequently are not -/// valid Unicode code points. We must therefore avoid sampling values in this -/// range. -#[derive(Clone, Copy, Debug)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformChar { - sampler: UniformInt, -} - -/// UTF-16 surrogate range start -const CHAR_SURROGATE_START: u32 = 0xD800; -/// UTF-16 surrogate range size -const CHAR_SURROGATE_LEN: u32 = 0xE000 - CHAR_SURROGATE_START; - -/// Convert `char` to compressed `u32` -fn char_to_comp_u32(c: char) -> u32 { - match c as u32 { - c if c >= CHAR_SURROGATE_START => c - CHAR_SURROGATE_LEN, - c => c, - } -} - -impl UniformSampler for UniformChar { - type X = char; - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. 
- fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = char_to_comp_u32(*low_b.borrow()); - let high = char_to_comp_u32(*high_b.borrow()); - let sampler = UniformInt::::new(low, high); - UniformChar { sampler } - } - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = char_to_comp_u32(*low_b.borrow()); - let high = char_to_comp_u32(*high_b.borrow()); - let sampler = UniformInt::::new_inclusive(low, high); - UniformChar { sampler } - } - - fn sample(&self, rng: &mut R) -> Self::X { - let mut x = self.sampler.sample(rng); - if x >= CHAR_SURROGATE_START { - x += CHAR_SURROGATE_LEN; - } - // SAFETY: x must not be in surrogate range or greater than char::MAX. - // This relies on range constructors which accept char arguments. - // Validity of input char values is assumed. - unsafe { core::char::from_u32_unchecked(x) } - } -} - /// The back-end implementing [`UniformSampler`] for floating-point types. /// /// Unless you are implementing [`UniformSampler`] for your own type, this type @@ -1025,195 +955,11 @@ uniform_float_impl! { f64x4, u64x4, f64, u64, 64 - 52 } uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 } -/// The back-end implementing [`UniformSampler`] for `Duration`. -/// -/// Unless you are implementing [`UniformSampler`] for your own types, this type -/// should not be used directly, use [`Uniform`] instead. -#[derive(Clone, Copy, Debug)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformDuration { - mode: UniformDurationMode, - offset: u32, -} - -#[derive(Debug, Copy, Clone)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -enum UniformDurationMode { - Small { - secs: u64, - nanos: Uniform, - }, - Medium { - nanos: Uniform, - }, - Large { - max_secs: u64, - max_nanos: u32, - secs: Uniform, - }, -} - -impl SampleUniform for Duration { - type Sampler = UniformDuration; -} - -impl UniformSampler for UniformDuration { - type X = Duration; - - #[inline] - fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low < high, "Uniform::new called with `low >= high`"); - UniformDuration::new_inclusive(low, high - Duration::new(0, 1)) - } - - #[inline] - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "Uniform::new_inclusive called with `low > high`" - ); - - let low_s = low.as_secs(); - let low_n = low.subsec_nanos(); - let mut high_s = high.as_secs(); - let mut high_n = high.subsec_nanos(); - - if high_n < low_n { - high_s -= 1; - high_n += 1_000_000_000; - } - - let mode = if low_s == high_s { - UniformDurationMode::Small { - secs: low_s, - nanos: Uniform::new_inclusive(low_n, high_n), - } - } else { - let max = high_s - .checked_mul(1_000_000_000) - .and_then(|n| n.checked_add(u64::from(high_n))); - - if let Some(higher_bound) = max { - let lower_bound = low_s * 1_000_000_000 + u64::from(low_n); - UniformDurationMode::Medium { - nanos: Uniform::new_inclusive(lower_bound, higher_bound), - } - } else { - // An offset is applied to simplify generation of nanoseconds - let max_nanos = high_n - low_n; - 
UniformDurationMode::Large { - max_secs: high_s, - max_nanos, - secs: Uniform::new_inclusive(low_s, high_s), - } - } - }; - UniformDuration { - mode, - offset: low_n, - } - } - - #[inline] - fn sample(&self, rng: &mut R) -> Duration { - match self.mode { - UniformDurationMode::Small { secs, nanos } => { - let n = nanos.sample(rng); - Duration::new(secs, n) - } - UniformDurationMode::Medium { nanos } => { - let nanos = nanos.sample(rng); - Duration::new(nanos / 1_000_000_000, (nanos % 1_000_000_000) as u32) - } - UniformDurationMode::Large { - max_secs, - max_nanos, - secs, - } => { - // constant folding means this is at least as fast as `Rng::sample(Range)` - let nano_range = Uniform::new(0, 1_000_000_000); - loop { - let s = secs.sample(rng); - let n = nano_range.sample(rng); - if !(s == max_secs && n > max_nanos) { - let sum = n + self.offset; - break Duration::new(s, sum); - } - } - } - } - } -} - #[cfg(test)] mod tests { use super::*; use crate::rngs::mock::StepRng; - #[test] - #[cfg(feature = "serde1")] - fn test_serialization_uniform_duration() { - let distr = UniformDuration::new(Duration::from_secs(10), Duration::from_secs(60)); - let de_distr: UniformDuration = - bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); - assert_eq!(distr.offset, de_distr.offset); - match (distr.mode, de_distr.mode) { - ( - UniformDurationMode::Small { - secs: a_secs, - nanos: a_nanos, - }, - UniformDurationMode::Small { secs, nanos }, - ) => { - assert_eq!(a_secs, secs); - - assert_eq!(a_nanos.0.low, nanos.0.low); - assert_eq!(a_nanos.0.range, nanos.0.range); - assert_eq!(a_nanos.0.z, nanos.0.z); - } - ( - UniformDurationMode::Medium { nanos: a_nanos }, - UniformDurationMode::Medium { nanos }, - ) => { - assert_eq!(a_nanos.0.low, nanos.0.low); - assert_eq!(a_nanos.0.range, nanos.0.range); - assert_eq!(a_nanos.0.z, nanos.0.z); - } - ( - UniformDurationMode::Large { - max_secs: a_max_secs, - max_nanos: a_max_nanos, - secs: a_secs, - }, - UniformDurationMode::Large { - max_secs, - max_nanos, - secs, - }, - ) => { - assert_eq!(a_max_secs, max_secs); - assert_eq!(a_max_nanos, max_nanos); - - assert_eq!(a_secs.0.low, secs.0.low); - assert_eq!(a_secs.0.range, secs.0.range); - assert_eq!(a_secs.0.z, secs.0.z); - } - _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly"), - } - } - #[test] #[cfg(feature = "serde1")] fn test_uniform_serialization() { @@ -1340,27 +1086,6 @@ mod tests { } } - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_char() { - let mut rng = crate::test::rng(891); - let mut max = core::char::from_u32(0).unwrap(); - for _ in 0..100 { - let c = rng.gen_range('A'..='Z'); - assert!(('A'..='Z').contains(&c)); - max = max.max(c); - } - assert_eq!(max, 'Z'); - let d = Uniform::new( - core::char::from_u32(0xD7F0).unwrap(), - core::char::from_u32(0xE010).unwrap(), - ); - for _ in 0..100 { - let c = d.sample(&mut rng); - assert!((c as u32) < 0xD800 || (c as u32) > 0xDFFF); - } - } - #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_floats() { @@ -1543,29 +1268,6 @@ mod tests { } } - - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_durations() { - let mut rng = crate::test::rng(253); - - let v = &[ - (Duration::new(10, 50000), Duration::new(100, 1234)), - (Duration::new(0, 100), Duration::new(1, 50)), - ( - Duration::new(0, 0), - Duration::new(u64::max_value(), 999_999_999), - ), - ]; - for &(low, high) in v.iter() { - let my_uniform = Uniform::new(low, high); - for _ in 0..1000 { - let v = rng.sample(my_uniform); 
- assert!(low <= v && v < high); - } - } - } - #[test] fn test_custom_uniform() { use crate::distributions::uniform::{ @@ -1636,26 +1338,26 @@ mod tests { assert!(r.0.scale < 5.0 + 1e-14); } - #[test] - fn value_stability() { - fn test_samples( - lb: T, ub: T, expected_single: &[T], expected_multiple: &[T], - ) where Uniform: Distribution { - let mut rng = crate::test::rng(897); - let mut buf = [lb; 3]; - - for x in &mut buf { - *x = T::Sampler::sample_single(lb, ub, &mut rng); - } - assert_eq!(&buf, expected_single); + pub(crate) fn test_samples( + lb: T, ub: T, expected_single: &[T], expected_multiple: &[T], + ) where Uniform: Distribution { + let mut rng = crate::test::rng(897); + let mut buf = [lb; 3]; - let distr = Uniform::new(lb, ub); - for x in &mut buf { - *x = rng.sample(&distr); - } - assert_eq!(&buf, expected_multiple); + for x in &mut buf { + *x = T::Sampler::sample_single(lb, ub, &mut rng); + } + assert_eq!(&buf, expected_single); + + let distr = Uniform::new(lb, ub); + for x in &mut buf { + *x = rng.sample(&distr); } + assert_eq!(&buf, expected_multiple); + } + #[test] + fn value_stability() { // We test on a sub-set of types; possibly we should do more. // TODO: SIMD types @@ -1673,28 +1375,12 @@ mod tests { &[-4673848682.871551, 6388267422.932352, 4857075081.198343], &[1173375212.1808167, 1917642852.109581, 2365076174.3153973], ); - - test_samples( - Duration::new(2, 0), - Duration::new(4, 0), - &[ - Duration::new(2, 532615131), - Duration::new(3, 638826742), - Duration::new(3, 485707508), - ], - &[ - Duration::new(3, 117337521), - Duration::new(3, 191764285), - Duration::new(3, 236507617), - ], - ); } #[test] fn uniform_distributions_can_be_compared() { assert_eq!(Uniform::new(1.0, 2.0), Uniform::new(1.0, 2.0)); - // To cover UniformInt - assert_eq!(Uniform::new(1 as u32, 2 as u32), Uniform::new(1 as u32, 2 as u32)); + assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); } } diff --git a/src/distributions/uniform/uniform_other.rs b/src/distributions/uniform/uniform_other.rs new file mode 100644 index 0000000000..f8232c484c --- /dev/null +++ b/src/distributions/uniform/uniform_other.rs @@ -0,0 +1,341 @@ +// Copyright 2018-2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{SampleBorrow, SampleUniform, Uniform, UniformInt, UniformSampler}; +use crate::distributions::Distribution; +use crate::Rng; +use core::time::Duration; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; + +impl SampleUniform for char { + type Sampler = UniformChar; +} + +/// The back-end implementing [`UniformSampler`] for `char`. +/// +/// Unless you are implementing [`UniformSampler`] for your own type, this type +/// should not be used directly, use [`Uniform`] instead. +/// +/// This differs from integer range sampling since the range `0xD800..=0xDFFF` +/// are used for surrogate pairs in UCS and UTF-16, and consequently are not +/// valid Unicode code points. We must therefore avoid sampling values in this +/// range. 
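[Note on the technique above] The doc comment just above explains why `char` sampling cannot simply draw a code point: the surrogate range `0xD800..=0xDFFF` must be skipped. A minimal standalone sketch of the same gap-compression idea, separate from the patch (the helper names are illustrative and not part of the rand API):

const SURROGATE_START: u32 = 0xD800;
const SURROGATE_LEN: u32 = 0xE000 - SURROGATE_START;

// Remove the surrogate gap so valid code points form one contiguous range.
fn compress(c: char) -> u32 {
    let c = c as u32;
    if c >= SURROGATE_START { c - SURROGATE_LEN } else { c }
}

// Re-insert the gap; a compressed value never lands inside it, so the
// conversion back to `char` always succeeds.
fn decompress(x: u32) -> char {
    let x = if x >= SURROGATE_START { x + SURROGATE_LEN } else { x };
    core::char::from_u32(x).expect("not a surrogate and <= char::MAX")
}

fn main() {
    // Sampling an integer uniformly over compress(low)..=compress(high) and
    // decompressing it yields a uniform char that is never a surrogate.
    assert_eq!(compress('\u{E000}') - compress('\u{D7FF}'), 1); // gap removed
    assert_eq!(decompress(compress('\u{10FFFF}')), '\u{10FFFF}');
}

`UniformChar` in the patch does the same thing, but stores the compressed bounds in a `UniformInt<u32>` and uses the unchecked conversion because its inputs are valid `char`s by construction.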
+#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformChar { + sampler: UniformInt, +} + +/// UTF-16 surrogate range start +const CHAR_SURROGATE_START: u32 = 0xD800; +/// UTF-16 surrogate range size +const CHAR_SURROGATE_LEN: u32 = 0xE000 - CHAR_SURROGATE_START; + +/// Convert `char` to compressed `u32` +fn char_to_comp_u32(c: char) -> u32 { + match c as u32 { + c if c >= CHAR_SURROGATE_START => c - CHAR_SURROGATE_LEN, + c => c, + } +} + +impl UniformSampler for UniformChar { + type X = char; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = char_to_comp_u32(*low_b.borrow()); + let high = char_to_comp_u32(*high_b.borrow()); + let sampler = UniformInt::::new(low, high); + UniformChar { sampler } + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = char_to_comp_u32(*low_b.borrow()); + let high = char_to_comp_u32(*high_b.borrow()); + let sampler = UniformInt::::new_inclusive(low, high); + UniformChar { sampler } + } + + fn sample(&self, rng: &mut R) -> Self::X { + let mut x = self.sampler.sample(rng); + if x >= CHAR_SURROGATE_START { + x += CHAR_SURROGATE_LEN; + } + // SAFETY: x must not be in surrogate range or greater than char::MAX. + // This relies on range constructors which accept char arguments. + // Validity of input char values is assumed. + unsafe { core::char::from_u32_unchecked(x) } + } +} + + +/// The back-end implementing [`UniformSampler`] for `Duration`. +/// +/// Unless you are implementing [`UniformSampler`] for your own types, this type +/// should not be used directly, use [`Uniform`] instead. 
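[Note on the sampler below] `UniformDuration` picks one of three strategies based on the size of the requested range: both bounds in the same whole second, total nanoseconds fitting in a `u64`, or a rejection loop over seconds. A brief usage sketch against the crate's public API (assuming the rand 0.8-style `Uniform`/`Distribution` interface used elsewhere in this module and `thread_rng` from the default `std` feature); the comments about which internal mode each range hits mirror the selection logic in the code that follows:

use core::time::Duration;
use rand::distributions::{Distribution, Uniform};

fn main() {
    let mut rng = rand::thread_rng();

    // Both bounds fall in the same whole second: only nanoseconds are sampled.
    let small = Uniform::new(Duration::from_millis(10), Duration::from_millis(20));
    // The span in nanoseconds fits in a u64: one integer draw is split into
    // seconds and nanoseconds.
    let medium = Uniform::new(Duration::from_secs(1), Duration::from_secs(3600));

    for _ in 0..100 {
        let d = small.sample(&mut rng);
        assert!(Duration::from_millis(10) <= d && d < Duration::from_millis(20));
        let d = medium.sample(&mut rng);
        assert!(Duration::from_secs(1) <= d && d < Duration::from_secs(3600));
    }
}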
+#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformDuration { + mode: UniformDurationMode, + offset: u32, +} + +#[derive(Debug, Copy, Clone, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +enum UniformDurationMode { + Small { + secs: u64, + nanos: Uniform, + }, + Medium { + nanos: Uniform, + }, + Large { + max_secs: u64, + max_nanos: u32, + secs: Uniform, + }, +} + +impl SampleUniform for Duration { + type Sampler = UniformDuration; +} + +impl UniformSampler for UniformDuration { + type X = Duration; + + #[inline] + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low < high, "Uniform::new called with `low >= high`"); + UniformDuration::new_inclusive(low, high - Duration::new(0, 1)) + } + + #[inline] + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "Uniform::new_inclusive called with `low > high`" + ); + + let low_s = low.as_secs(); + let low_n = low.subsec_nanos(); + let mut high_s = high.as_secs(); + let mut high_n = high.subsec_nanos(); + + if high_n < low_n { + high_s -= 1; + high_n += 1_000_000_000; + } + + let mode = if low_s == high_s { + UniformDurationMode::Small { + secs: low_s, + nanos: Uniform::new_inclusive(low_n, high_n), + } + } else { + let max = high_s + .checked_mul(1_000_000_000) + .and_then(|n| n.checked_add(u64::from(high_n))); + + if let Some(higher_bound) = max { + let lower_bound = low_s * 1_000_000_000 + u64::from(low_n); + UniformDurationMode::Medium { + nanos: Uniform::new_inclusive(lower_bound, higher_bound), + } + } else { + // An offset is applied to simplify generation of nanoseconds + let max_nanos = high_n - low_n; + UniformDurationMode::Large { + max_secs: high_s, + max_nanos, + secs: Uniform::new_inclusive(low_s, high_s), + } + } + }; + UniformDuration { + mode, + offset: low_n, + } + } + + #[inline] + fn sample(&self, rng: &mut R) -> Duration { + match self.mode { + UniformDurationMode::Small { secs, nanos } => { + let n = nanos.sample(rng); + Duration::new(secs, n) + } + UniformDurationMode::Medium { nanos } => { + let nanos = nanos.sample(rng); + Duration::new(nanos / 1_000_000_000, (nanos % 1_000_000_000) as u32) + } + UniformDurationMode::Large { + max_secs, + max_nanos, + secs, + } => { + // constant folding means this is at least as fast as `Rng::sample(Range)` + let nano_range = Uniform::new(0, 1_000_000_000); + loop { + let s = secs.sample(rng); + let n = nano_range.sample(rng); + if !(s == max_secs && n > max_nanos) { + let sum = n + self.offset; + break Duration::new(s, sum); + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distributions::uniform::tests::test_samples; + + #[test] + #[cfg(feature = "serde1")] + fn test_serialization_uniform_duration() { + let distr = UniformDuration::new(Duration::from_secs(10), Duration::from_secs(60)); + let de_distr: UniformDuration = + bincode::deserialize(&bincode::serialize(&distr).unwrap()).unwrap(); + assert_eq!(distr.offset, de_distr.offset); + match (distr.mode, de_distr.mode) { + ( + UniformDurationMode::Small { + secs: a_secs, + nanos: a_nanos, + }, + UniformDurationMode::Small { secs, nanos }, + ) => { + assert_eq!(a_secs, secs); + assert_eq!(a_nanos, nanos); + } + ( + 
UniformDurationMode::Medium { nanos: a_nanos }, + UniformDurationMode::Medium { nanos }, + ) => { + assert_eq!(a_nanos, nanos); + } + ( + UniformDurationMode::Large { + max_secs: a_max_secs, + max_nanos: a_max_nanos, + secs: a_secs, + }, + UniformDurationMode::Large { + max_secs, + max_nanos, + secs, + }, + ) => { + assert_eq!(a_max_secs, max_secs); + assert_eq!(a_max_nanos, max_nanos); + assert_eq!(a_secs, secs); + } + _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly"), + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_char() { + let mut rng = crate::test::rng(891); + let mut max = core::char::from_u32(0).unwrap(); + for _ in 0..100 { + let c = rng.gen_range('A'..='Z'); + assert!(('A'..='Z').contains(&c)); + max = max.max(c); + } + assert_eq!(max, 'Z'); + let d = Uniform::new( + core::char::from_u32(0xD7F0).unwrap(), + core::char::from_u32(0xE010).unwrap(), + ); + for _ in 0..100 { + let c = d.sample(&mut rng); + assert!((c as u32) < 0xD800 || (c as u32) > 0xDFFF); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_durations() { + let mut rng = crate::test::rng(253); + + let v = &[ + (Duration::new(10, 50000), Duration::new(100, 1234)), + (Duration::new(0, 100), Duration::new(1, 50)), + ( + Duration::new(0, 0), + Duration::new(u64::max_value(), 999_999_999), + ), + ]; + for &(low, high) in v.iter() { + let my_uniform = Uniform::new(low, high); + for _ in 0..1000 { + let v = rng.sample(my_uniform); + assert!(low <= v && v < high); + } + } + } + + #[test] + fn value_stability() { + test_samples( + Duration::new(2, 0), + Duration::new(4, 0), + &[ + Duration::new(2, 532615131), + Duration::new(3, 638826742), + Duration::new(3, 485707508), + ], + &[ + Duration::new(3, 117337521), + Duration::new(3, 191764285), + Duration::new(3, 236507617), + ], + ); + + test_samples('a', 'z', &['a', 'g', 'y'], &['u', 'j', 's']); + } + + #[test] + fn uniform_distributions_can_be_compared() { + assert_eq!(Uniform::new('a', 'g'), Uniform::new('a', 'g')); + assert_eq!( + Uniform::new(Duration::from_millis(10), Duration::from_millis(20)), + Uniform::new(Duration::from_millis(10), Duration::from_millis(20)), + ); + } +} From 255ff71cee94c81b313d2fe4cd49fed2eef10862 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 24 Feb 2022 11:40:14 +0000 Subject: [PATCH 03/17] Move UniformFloat to a new submodule --- src/distributions/uniform.rs | 434 +------------------ src/distributions/uniform/uniform_float.rs | 480 +++++++++++++++++++++ 2 files changed, 483 insertions(+), 431 deletions(-) create mode 100644 src/distributions/uniform/uniform_float.rs diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 07a4ddbbf2..7e1bdcb2c8 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -108,8 +108,7 @@ use core::ops::{Range, RangeInclusive}; -use crate::distributions::float::IntoFloat; -use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; +use crate::distributions::utils::WideningMultiply; use crate::distributions::Distribution; use crate::{Rng, RngCore}; @@ -121,8 +120,10 @@ use crate::distributions::utils::Float; #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; +mod uniform_float; mod uniform_other; +pub use uniform_float::UniformFloat; pub use uniform_other::{UniformChar, UniformDuration}; /// Sample values uniformly between two bounds. @@ -724,241 +725,10 @@ uniform_simd_int_impl! 
{ u8 } -/// The back-end implementing [`UniformSampler`] for floating-point types. -/// -/// Unless you are implementing [`UniformSampler`] for your own type, this type -/// should not be used directly, use [`Uniform`] instead. -/// -/// # Implementation notes -/// -/// Instead of generating a float in the `[0, 1)` range using [`Standard`], the -/// `UniformFloat` implementation converts the output of an PRNG itself. This -/// way one or two steps can be optimized out. -/// -/// The floats are first converted to a value in the `[1, 2)` interval using a -/// transmute-based method, and then mapped to the expected range with a -/// multiply and addition. Values produced this way have what equals 23 bits of -/// random digits for an `f32`, and 52 for an `f64`. -/// -/// [`new`]: UniformSampler::new -/// [`new_inclusive`]: UniformSampler::new_inclusive -/// [`Standard`]: crate::distributions::Standard -#[derive(Clone, Copy, Debug, PartialEq)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformFloat { - low: X, - scale: X, -} - -macro_rules! uniform_float_impl { - ($ty:ty, $uty:ident, $f_scalar:ident, $u_scalar:ident, $bits_to_discard:expr) => { - impl SampleUniform for $ty { - type Sampler = UniformFloat<$ty>; - } - - impl UniformSampler for UniformFloat<$ty> { - type X = $ty; - - fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - debug_assert!( - low.all_finite(), - "Uniform::new called with `low` non-finite." - ); - debug_assert!( - high.all_finite(), - "Uniform::new called with `high` non-finite." - ); - assert!(low.all_lt(high), "Uniform::new called with `low >= high`"); - let max_rand = <$ty>::splat( - (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, - ); - - let mut scale = high - low; - assert!(scale.all_finite(), "Uniform::new: range overflow"); - - loop { - let mask = (scale * max_rand + low).ge_mask(high); - if mask.none() { - break; - } - scale = scale.decrease_masked(mask); - } - - debug_assert!(<$ty>::splat(0.0).all_le(scale)); - - UniformFloat { low, scale } - } - - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - debug_assert!( - low.all_finite(), - "Uniform::new_inclusive called with `low` non-finite." - ); - debug_assert!( - high.all_finite(), - "Uniform::new_inclusive called with `high` non-finite." - ); - assert!( - low.all_le(high), - "Uniform::new_inclusive called with `low > high`" - ); - let max_rand = <$ty>::splat( - (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, - ); - - let mut scale = (high - low) / max_rand; - assert!(scale.all_finite(), "Uniform::new_inclusive: range overflow"); - - loop { - let mask = (scale * max_rand + low).gt_mask(high); - if mask.none() { - break; - } - scale = scale.decrease_masked(mask); - } - - debug_assert!(<$ty>::splat(0.0).all_le(scale)); - - UniformFloat { low, scale } - } - - fn sample(&self, rng: &mut R) -> Self::X { - // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); - - // Get a value in the range [0, 1) in order to avoid - // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; - - // We don't use `f64::mul_add`, because it is not available with - // `no_std`. 
Furthermore, it is slower for some targets (but - // faster for others). However, the order of multiplication and - // addition is important, because on some platforms (e.g. ARM) - // it will be optimized to a single (non-FMA) instruction. - value0_1 * self.scale + self.low - } - - #[inline] - fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - debug_assert!( - low.all_finite(), - "UniformSampler::sample_single called with `low` non-finite." - ); - debug_assert!( - high.all_finite(), - "UniformSampler::sample_single called with `high` non-finite." - ); - assert!( - low.all_lt(high), - "UniformSampler::sample_single: low >= high" - ); - let mut scale = high - low; - assert!( - scale.all_finite(), - "UniformSampler::sample_single: range overflow" - ); - - loop { - // Generate a value in the range [1, 2) - let value1_2 = - (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); - - // Get a value in the range [0, 1) in order to avoid - // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; - - // Doing multiply before addition allows some architectures - // to use a single instruction. - let res = value0_1 * scale + low; - - debug_assert!(low.all_le(res) || !scale.all_finite()); - if res.all_lt(high) { - return res; - } - - // This handles a number of edge cases. - // * `low` or `high` is NaN. In this case `scale` and - // `res` are going to end up as NaN. - // * `low` is negative infinity and `high` is finite. - // `scale` is going to be infinite and `res` will be - // NaN. - // * `high` is positive infinity and `low` is finite. - // `scale` is going to be infinite and `res` will - // be infinite or NaN (if value0_1 is 0). - // * `low` is negative infinity and `high` is positive - // infinity. `scale` will be infinite and `res` will - // be NaN. - // * `low` and `high` are finite, but `high - low` - // overflows to infinite. `scale` will be infinite - // and `res` will be infinite or NaN (if value0_1 is 0). - // So if `high` or `low` are non-finite, we are guaranteed - // to fail the `res < high` check above and end up here. - // - // While we technically should check for non-finite `low` - // and `high` before entering the loop, by doing the checks - // here instead, we allow the common case to avoid these - // checks. But we are still guaranteed that if `low` or - // `high` are non-finite we'll end up here and can do the - // appropriate checks. - // - // Likewise `high - low` overflowing to infinity is also - // rare, so handle it here after the common case. - let mask = !scale.finite_mask(); - if mask.any() { - assert!( - low.all_finite() && high.all_finite(), - "Uniform::sample_single: low and high must be finite" - ); - scale = scale.decrease_masked(mask); - } - } - } - } - }; -} - -uniform_float_impl! { f32, u32, f32, u32, 32 - 23 } -uniform_float_impl! { f64, u64, f64, u64, 64 - 52 } - -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x2, u32x2, f32, u32, 32 - 23 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x4, u32x4, f32, u32, 32 - 23 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x8, u32x8, f32, u32, 32 - 23 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f32x16, u32x16, f32, u32, 32 - 23 } - -#[cfg(feature = "simd_support")] -uniform_float_impl! { f64x2, u64x2, f64, u64, 64 - 52 } -#[cfg(feature = "simd_support")] -uniform_float_impl! 
{ f64x4, u64x4, f64, u64, 64 - 52 } -#[cfg(feature = "simd_support")] -uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 } - #[cfg(test)] mod tests { use super::*; - use crate::rngs::mock::StepRng; #[test] #[cfg(feature = "serde1")] @@ -970,13 +740,6 @@ mod tests { assert_eq!(unit_box.0.low, de_unit_box.0.low); assert_eq!(unit_box.0.range, de_unit_box.0.range); assert_eq!(unit_box.0.z, de_unit_box.0.z); - - let unit_box: Uniform = Uniform::new(-1., 1.); - let de_unit_box: Uniform = - bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); - - assert_eq!(unit_box.0.low, de_unit_box.0.low); - assert_eq!(unit_box.0.scale, de_unit_box.0.scale); } #[should_panic] @@ -1086,188 +849,6 @@ mod tests { } } - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_floats() { - let mut rng = crate::test::rng(252); - let mut zero_rng = StepRng::new(0, 0); - let mut max_rng = StepRng::new(0xffff_ffff_ffff_ffff, 0); - macro_rules! t { - ($ty:ty, $f_scalar:ident, $bits_shifted:expr) => {{ - let v: &[($f_scalar, $f_scalar)] = &[ - (0.0, 100.0), - (-1e35, -1e25), - (1e-35, 1e-25), - (-1e35, 1e35), - (<$f_scalar>::from_bits(0), <$f_scalar>::from_bits(3)), - (-<$f_scalar>::from_bits(10), -<$f_scalar>::from_bits(1)), - (-<$f_scalar>::from_bits(5), 0.0), - (-<$f_scalar>::from_bits(7), -0.0), - (0.1 * ::core::$f_scalar::MAX, ::core::$f_scalar::MAX), - (-::core::$f_scalar::MAX * 0.2, ::core::$f_scalar::MAX * 0.7), - ]; - for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { - let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); - let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); - let my_uniform = Uniform::new(low, high); - let my_incl_uniform = Uniform::new_inclusive(low, high); - for _ in 0..100 { - let v = rng.sample(my_uniform).extract(lane); - assert!(low_scalar <= v && v < high_scalar); - let v = rng.sample(my_incl_uniform).extract(lane); - assert!(low_scalar <= v && v <= high_scalar); - let v = - <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng) - .extract(lane); - assert!(low_scalar <= v && v < high_scalar); - } - - assert_eq!( - rng.sample(Uniform::new_inclusive(low, low)).extract(lane), - low_scalar - ); - - assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar); - assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar); - assert_eq!( - <$ty as SampleUniform>::Sampler::sample_single( - low, - high, - &mut zero_rng - ) - .extract(lane), - low_scalar - ); - assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar); - assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar); - - // Don't run this test for really tiny differences between high and low - // since for those rounding might result in selecting high for a very - // long time. 
- if (high_scalar - low_scalar) > 0.0001 { - let mut lowering_max_rng = StepRng::new( - 0xffff_ffff_ffff_ffff, - (-1i64 << $bits_shifted) as u64, - ); - assert!( - <$ty as SampleUniform>::Sampler::sample_single( - low, - high, - &mut lowering_max_rng - ) - .extract(lane) - < high_scalar - ); - } - } - } - - assert_eq!( - rng.sample(Uniform::new_inclusive( - ::core::$f_scalar::MAX, - ::core::$f_scalar::MAX - )), - ::core::$f_scalar::MAX - ); - assert_eq!( - rng.sample(Uniform::new_inclusive( - -::core::$f_scalar::MAX, - -::core::$f_scalar::MAX - )), - -::core::$f_scalar::MAX - ); - }}; - } - - t!(f32, f32, 32 - 23); - t!(f64, f64, 64 - 52); - #[cfg(feature = "simd_support")] - { - t!(f32x2, f32, 32 - 23); - t!(f32x4, f32, 32 - 23); - t!(f32x8, f32, 32 - 23); - t!(f32x16, f32, 32 - 23); - t!(f64x2, f64, 64 - 52); - t!(f64x4, f64, 64 - 52); - t!(f64x8, f64, 64 - 52); - } - } - - #[test] - #[should_panic] - fn test_float_overflow() { - let _ = Uniform::from(::core::f64::MIN..::core::f64::MAX); - } - - #[test] - #[should_panic] - fn test_float_overflow_single() { - let mut rng = crate::test::rng(252); - rng.gen_range(::core::f64::MIN..::core::f64::MAX); - } - - #[test] - #[cfg(all( - feature = "std", - not(target_arch = "wasm32"), - not(target_arch = "asmjs") - ))] - fn test_float_assertions() { - use super::SampleUniform; - use std::panic::catch_unwind; - fn range(low: T, high: T) { - let mut rng = crate::test::rng(253); - T::Sampler::sample_single(low, high, &mut rng); - } - - macro_rules! t { - ($ty:ident, $f_scalar:ident) => {{ - let v: &[($f_scalar, $f_scalar)] = &[ - (::std::$f_scalar::NAN, 0.0), - (1.0, ::std::$f_scalar::NAN), - (::std::$f_scalar::NAN, ::std::$f_scalar::NAN), - (1.0, 0.5), - (::std::$f_scalar::MAX, -::std::$f_scalar::MAX), - (::std::$f_scalar::INFINITY, ::std::$f_scalar::INFINITY), - ( - ::std::$f_scalar::NEG_INFINITY, - ::std::$f_scalar::NEG_INFINITY, - ), - (::std::$f_scalar::NEG_INFINITY, 5.0), - (5.0, ::std::$f_scalar::INFINITY), - (::std::$f_scalar::NAN, ::std::$f_scalar::INFINITY), - (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::NAN), - (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY), - ]; - for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { - let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); - let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); - assert!(catch_unwind(|| range(low, high)).is_err()); - assert!(catch_unwind(|| Uniform::new(low, high)).is_err()); - assert!(catch_unwind(|| Uniform::new_inclusive(low, high)).is_err()); - assert!(catch_unwind(|| range(low, low)).is_err()); - assert!(catch_unwind(|| Uniform::new(low, low)).is_err()); - } - } - }}; - } - - t!(f32, f32); - t!(f64, f64); - #[cfg(feature = "simd_support")] - { - t!(f32x2, f32); - t!(f32x4, f32); - t!(f32x8, f32); - t!(f32x16, f32); - t!(f64x2, f64); - t!(f64x4, f64); - t!(f64x8, f64); - } - } - #[test] fn test_custom_uniform() { use crate::distributions::uniform::{ @@ -1322,9 +903,6 @@ mod tests { let r = Uniform::from(2u32..7); assert_eq!(r.0.low, 2); assert_eq!(r.0.range, 5); - let r = Uniform::from(2.0f64..7.0); - assert_eq!(r.0.low, 2.0); - assert_eq!(r.0.scale, 5.0); } #[test] @@ -1332,10 +910,6 @@ mod tests { let r = Uniform::from(2u32..=6); assert_eq!(r.0.low, 2); assert_eq!(r.0.range, 5); - let r = Uniform::from(2.0f64..=7.0); - assert_eq!(r.0.low, 2.0); - assert!(r.0.scale > 5.0); - assert!(r.0.scale < 5.0 + 1e-14); } pub(crate) fn test_samples( @@ -1379,8 +953,6 @@ mod tests { #[test] fn 
uniform_distributions_can_be_compared() { - assert_eq!(Uniform::new(1.0, 2.0), Uniform::new(1.0, 2.0)); - assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); } } diff --git a/src/distributions/uniform/uniform_float.rs b/src/distributions/uniform/uniform_float.rs new file mode 100644 index 0000000000..cf4e37e067 --- /dev/null +++ b/src/distributions/uniform/uniform_float.rs @@ -0,0 +1,480 @@ +// Copyright 2018-2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{SampleBorrow, SampleUniform, UniformSampler}; +use crate::distributions::float::IntoFloat; +use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils}; +use crate::Rng; +#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; + +/// The back-end implementing [`UniformSampler`] for floating-point types. +/// +/// Unless you are implementing [`UniformSampler`] for your own type, this type +/// should not be used directly, use [`Uniform`] instead. +/// +/// # Implementation notes +/// +/// Instead of generating a float in the `[0, 1)` range using [`Standard`], the +/// `UniformFloat` implementation converts the output of an PRNG itself. This +/// way one or two steps can be optimized out. +/// +/// The floats are first converted to a value in the `[1, 2)` interval using a +/// transmute-based method, and then mapped to the expected range with a +/// multiply and addition. Values produced this way have what equals 23 bits of +/// random digits for an `f32`, and 52 for an `f64`. +/// +/// [`new`]: UniformSampler::new +/// [`new_inclusive`]: UniformSampler::new_inclusive +/// [`Standard`]: crate::distributions::Standard +#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformFloat { + low: X, + scale: X, +} + +macro_rules! uniform_float_impl { + ($ty:ty, $uty:ident, $f_scalar:ident, $u_scalar:ident, $bits_to_discard:expr) => { + impl SampleUniform for $ty { + type Sampler = UniformFloat<$ty>; + } + + impl UniformSampler for UniformFloat<$ty> { + type X = $ty; + + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + debug_assert!( + low.all_finite(), + "Uniform::new called with `low` non-finite." + ); + debug_assert!( + high.all_finite(), + "Uniform::new called with `high` non-finite." + ); + assert!(low.all_lt(high), "Uniform::new called with `low >= high`"); + let max_rand = <$ty>::splat( + (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, + ); + + let mut scale = high - low; + assert!(scale.all_finite(), "Uniform::new: range overflow"); + + loop { + let mask = (scale * max_rand + low).ge_mask(high); + if mask.none() { + break; + } + scale = scale.decrease_masked(mask); + } + + debug_assert!(<$ty>::splat(0.0).all_le(scale)); + + UniformFloat { low, scale } + } + + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + debug_assert!( + low.all_finite(), + "Uniform::new_inclusive called with `low` non-finite." + ); + debug_assert!( + high.all_finite(), + "Uniform::new_inclusive called with `high` non-finite." 
+ ); + assert!( + low.all_le(high), + "Uniform::new_inclusive called with `low > high`" + ); + let max_rand = <$ty>::splat( + (::core::$u_scalar::MAX >> $bits_to_discard).into_float_with_exponent(0) - 1.0, + ); + + let mut scale = (high - low) / max_rand; + assert!(scale.all_finite(), "Uniform::new_inclusive: range overflow"); + + loop { + let mask = (scale * max_rand + low).gt_mask(high); + if mask.none() { + break; + } + scale = scale.decrease_masked(mask); + } + + debug_assert!(<$ty>::splat(0.0).all_le(scale)); + + UniformFloat { low, scale } + } + + fn sample(&self, rng: &mut R) -> Self::X { + // Generate a value in the range [1, 2) + let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + + // Get a value in the range [0, 1) in order to avoid + // overflowing into infinity when multiplying with scale + let value0_1 = value1_2 - 1.0; + + // We don't use `f64::mul_add`, because it is not available with + // `no_std`. Furthermore, it is slower for some targets (but + // faster for others). However, the order of multiplication and + // addition is important, because on some platforms (e.g. ARM) + // it will be optimized to a single (non-FMA) instruction. + value0_1 * self.scale + self.low + } + + #[inline] + fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + debug_assert!( + low.all_finite(), + "UniformSampler::sample_single called with `low` non-finite." + ); + debug_assert!( + high.all_finite(), + "UniformSampler::sample_single called with `high` non-finite." + ); + assert!( + low.all_lt(high), + "UniformSampler::sample_single: low >= high" + ); + let mut scale = high - low; + assert!( + scale.all_finite(), + "UniformSampler::sample_single: range overflow" + ); + + loop { + // Generate a value in the range [1, 2) + let value1_2 = + (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + + // Get a value in the range [0, 1) in order to avoid + // overflowing into infinity when multiplying with scale + let value0_1 = value1_2 - 1.0; + + // Doing multiply before addition allows some architectures + // to use a single instruction. + let res = value0_1 * scale + low; + + debug_assert!(low.all_le(res) || !scale.all_finite()); + if res.all_lt(high) { + return res; + } + + // This handles a number of edge cases. + // * `low` or `high` is NaN. In this case `scale` and + // `res` are going to end up as NaN. + // * `low` is negative infinity and `high` is finite. + // `scale` is going to be infinite and `res` will be + // NaN. + // * `high` is positive infinity and `low` is finite. + // `scale` is going to be infinite and `res` will + // be infinite or NaN (if value0_1 is 0). + // * `low` is negative infinity and `high` is positive + // infinity. `scale` will be infinite and `res` will + // be NaN. + // * `low` and `high` are finite, but `high - low` + // overflows to infinite. `scale` will be infinite + // and `res` will be infinite or NaN (if value0_1 is 0). + // So if `high` or `low` are non-finite, we are guaranteed + // to fail the `res < high` check above and end up here. + // + // While we technically should check for non-finite `low` + // and `high` before entering the loop, by doing the checks + // here instead, we allow the common case to avoid these + // checks. But we are still guaranteed that if `low` or + // `high` are non-finite we'll end up here and can do the + // appropriate checks. 
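[Note on the technique above] The implementation notes at the top of this new module, and the comments in `sample`/`sample_single`, describe the core trick: fill the mantissa of a float in the `[1, 2)` interval directly from PRNG bits, subtract 1, then map with `value * scale + low`. A minimal standalone sketch of that bit manipulation for `f32`, using a hard-coded bit pattern instead of the crate's `IntoFloat` helper (the function name and test values here are illustrative):

// 0x3F80_0000 is the bit pattern of 1.0f32 (sign 0, exponent 127, mantissa 0);
// OR-ing 23 random bits into the mantissa field yields a float in [1, 2).
fn u32_to_unit_f32(random: u32) -> f32 {
    let value1_2 = f32::from_bits(0x3F80_0000 | (random >> (32 - 23)));
    value1_2 - 1.0
}

fn main() {
    assert_eq!(u32_to_unit_f32(0), 0.0);
    assert!(u32_to_unit_f32(u32::MAX) < 1.0); // largest output is 1 - 2^-23

    // Mapping to [low, high) is then multiply-before-add, as in the sampler.
    let (low, scale) = (10.0f32, 5.0f32);
    let x = u32_to_unit_f32(0x8000_0000) * scale + low;
    assert_eq!(x, 12.5);
}

For arbitrary bounds, rounding of `value * scale + low` can still reach `high`, which is why `new`/`new_inclusive` shrink `scale` in a loop and `sample_single` rejects and retries, as the code above shows.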
+ // + // Likewise `high - low` overflowing to infinity is also + // rare, so handle it here after the common case. + let mask = !scale.finite_mask(); + if mask.any() { + assert!( + low.all_finite() && high.all_finite(), + "Uniform::sample_single: low and high must be finite" + ); + scale = scale.decrease_masked(mask); + } + } + } + } + }; +} + +uniform_float_impl! { f32, u32, f32, u32, 32 - 23 } +uniform_float_impl! { f64, u64, f64, u64, 64 - 52 } + +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x2, u32x2, f32, u32, 32 - 23 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x4, u32x4, f32, u32, 32 - 23 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x8, u32x8, f32, u32, 32 - 23 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f32x16, u32x16, f32, u32, 32 - 23 } + +#[cfg(feature = "simd_support")] +uniform_float_impl! { f64x2, u64x2, f64, u64, 64 - 52 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f64x4, u64x4, f64, u64, 64 - 52 } +#[cfg(feature = "simd_support")] +uniform_float_impl! { f64x8, u64x8, f64, u64, 64 - 52 } + +#[cfg(test)] +mod tests { + use super::*; + use crate::distributions::uniform::tests::test_samples; + use crate::distributions::Uniform; + use crate::rngs::mock::StepRng; + + #[test] + #[cfg(feature = "serde1")] + fn test_uniform_serialization() { + let unit_box: Uniform = Uniform::new(-1., 1.); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + + assert_eq!(unit_box.0.low, de_unit_box.0.low); + assert_eq!(unit_box.0.scale, de_unit_box.0.scale); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_floats() { + let mut rng = crate::test::rng(252); + let mut zero_rng = StepRng::new(0, 0); + let mut max_rng = StepRng::new(0xffff_ffff_ffff_ffff, 0); + macro_rules! 
t { + ($ty:ty, $f_scalar:ident, $bits_shifted:expr) => {{ + let v: &[($f_scalar, $f_scalar)] = &[ + (0.0, 100.0), + (-1e35, -1e25), + (1e-35, 1e-25), + (-1e35, 1e35), + (<$f_scalar>::from_bits(0), <$f_scalar>::from_bits(3)), + (-<$f_scalar>::from_bits(10), -<$f_scalar>::from_bits(1)), + (-<$f_scalar>::from_bits(5), 0.0), + (-<$f_scalar>::from_bits(7), -0.0), + (0.1 * ::core::$f_scalar::MAX, ::core::$f_scalar::MAX), + (-::core::$f_scalar::MAX * 0.2, ::core::$f_scalar::MAX * 0.7), + ]; + for &(low_scalar, high_scalar) in v.iter() { + for lane in 0..<$ty>::lanes() { + let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); + let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); + let my_uniform = Uniform::new(low, high); + let my_incl_uniform = Uniform::new_inclusive(low, high); + for _ in 0..100 { + let v = rng.sample(my_uniform).extract(lane); + assert!(low_scalar <= v && v < high_scalar); + let v = rng.sample(my_incl_uniform).extract(lane); + assert!(low_scalar <= v && v <= high_scalar); + let v = + <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng) + .extract(lane); + assert!(low_scalar <= v && v < high_scalar); + } + + assert_eq!( + rng.sample(Uniform::new_inclusive(low, low)).extract(lane), + low_scalar + ); + + assert_eq!(zero_rng.sample(my_uniform).extract(lane), low_scalar); + assert_eq!(zero_rng.sample(my_incl_uniform).extract(lane), low_scalar); + assert_eq!( + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut zero_rng + ) + .extract(lane), + low_scalar + ); + assert!(max_rng.sample(my_uniform).extract(lane) < high_scalar); + assert!(max_rng.sample(my_incl_uniform).extract(lane) <= high_scalar); + + // Don't run this test for really tiny differences between high and low + // since for those rounding might result in selecting high for a very + // long time. + if (high_scalar - low_scalar) > 0.0001 { + let mut lowering_max_rng = StepRng::new( + 0xffff_ffff_ffff_ffff, + (-1i64 << $bits_shifted) as u64, + ); + assert!( + <$ty as SampleUniform>::Sampler::sample_single( + low, + high, + &mut lowering_max_rng + ) + .extract(lane) + < high_scalar + ); + } + } + } + + assert_eq!( + rng.sample(Uniform::new_inclusive( + ::core::$f_scalar::MAX, + ::core::$f_scalar::MAX + )), + ::core::$f_scalar::MAX + ); + assert_eq!( + rng.sample(Uniform::new_inclusive( + -::core::$f_scalar::MAX, + -::core::$f_scalar::MAX + )), + -::core::$f_scalar::MAX + ); + }}; + } + + t!(f32, f32, 32 - 23); + t!(f64, f64, 64 - 52); + #[cfg(feature = "simd_support")] + { + t!(f32x2, f32, 32 - 23); + t!(f32x4, f32, 32 - 23); + t!(f32x8, f32, 32 - 23); + t!(f32x16, f32, 32 - 23); + t!(f64x2, f64, 64 - 52); + t!(f64x4, f64, 64 - 52); + t!(f64x8, f64, 64 - 52); + } + } + + #[test] + #[should_panic] + fn test_float_overflow() { + let _ = Uniform::from(::core::f64::MIN..::core::f64::MAX); + } + + #[test] + #[should_panic] + fn test_float_overflow_single() { + let mut rng = crate::test::rng(252); + rng.gen_range(::core::f64::MIN..::core::f64::MAX); + } + + #[test] + #[cfg(all( + feature = "std", + not(target_arch = "wasm32"), + not(target_arch = "asmjs") + ))] + fn test_float_assertions() { + use super::SampleUniform; + use std::panic::catch_unwind; + fn range(low: T, high: T) { + let mut rng = crate::test::rng(253); + T::Sampler::sample_single(low, high, &mut rng); + } + + macro_rules! 
t { + ($ty:ident, $f_scalar:ident) => {{ + let v: &[($f_scalar, $f_scalar)] = &[ + (::std::$f_scalar::NAN, 0.0), + (1.0, ::std::$f_scalar::NAN), + (::std::$f_scalar::NAN, ::std::$f_scalar::NAN), + (1.0, 0.5), + (::std::$f_scalar::MAX, -::std::$f_scalar::MAX), + (::std::$f_scalar::INFINITY, ::std::$f_scalar::INFINITY), + ( + ::std::$f_scalar::NEG_INFINITY, + ::std::$f_scalar::NEG_INFINITY, + ), + (::std::$f_scalar::NEG_INFINITY, 5.0), + (5.0, ::std::$f_scalar::INFINITY), + (::std::$f_scalar::NAN, ::std::$f_scalar::INFINITY), + (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::NAN), + (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY), + ]; + for &(low_scalar, high_scalar) in v.iter() { + for lane in 0..<$ty>::lanes() { + let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); + let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); + assert!(catch_unwind(|| range(low, high)).is_err()); + assert!(catch_unwind(|| Uniform::new(low, high)).is_err()); + assert!(catch_unwind(|| Uniform::new_inclusive(low, high)).is_err()); + assert!(catch_unwind(|| range(low, low)).is_err()); + assert!(catch_unwind(|| Uniform::new(low, low)).is_err()); + } + } + }}; + } + + t!(f32, f32); + t!(f64, f64); + #[cfg(feature = "simd_support")] + { + t!(f32x2, f32); + t!(f32x4, f32); + t!(f32x8, f32); + t!(f32x16, f32); + t!(f64x2, f64); + t!(f64x4, f64); + t!(f64x8, f64); + } + } + + #[test] + fn test_uniform_from_std_range() { + let r = Uniform::from(2.0f64..7.0); + assert_eq!(r.0.low, 2.0); + assert_eq!(r.0.scale, 5.0); + } + + #[test] + fn test_uniform_from_std_range_inclusive() { + let r = Uniform::from(2.0f64..=7.0); + assert_eq!(r.0.low, 2.0); + assert!(r.0.scale > 5.0); + assert!(r.0.scale < 5.0 + 1e-14); + } + + #[test] + fn value_stability() { + test_samples(0f32, 1000000f32, &[30701.041, 266307.47, 979833.0], &[ + 819413.3, 398172.03, 742853.6, + ]); + test_samples( + -1f64, + 1f64, + &[-0.4673848682871551, 0.6388267422932352, 0.4857075081198343], + &[0.11733752121808161, 0.191764285210958, 0.2365076174315397], + ); + + // TODO: SIMD types + } + + #[test] + fn uniform_distributions_can_be_compared() { + assert_eq!(Uniform::new(1.0, 2.0), Uniform::new(1.0, 2.0)); + } +} From 00869d7c3d223d1f575e7f4d64b44dca9cca92ef Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 10:13:52 +0100 Subject: [PATCH 04/17] Impl ONeill, Canon, Canon-Lemire and Bitmask methods for integer types This is based on @TheIronBorn's work (#1154, #1172), with some changes. --- src/distributions/uniform.rs | 198 ++++++++++++++++++++++++++++++++--- 1 file changed, 182 insertions(+), 16 deletions(-) diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 7e1bdcb2c8..1029a3f888 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -433,7 +433,7 @@ pub struct UniformInt { } macro_rules! uniform_int_impl { - ($ty:ty, $unsigned:ident, $u_large:ident) => { + ($ty:ty, $unsigned:ident, $u_large:ident, $u_extra_large:ident) => { impl SampleUniform for $ty { type Sampler = UniformInt<$ty>; } @@ -536,9 +536,8 @@ macro_rules! uniform_int_impl { "UniformSampler::sample_single_inclusive: low > high" ); let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - // If the above resulted in wrap-around to 0, the range is $ty::MIN..=$ty::MAX, - // and any integer will do. if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case return rng.gen(); } @@ -564,21 +563,188 @@ macro_rules! 
uniform_int_impl { } } } + + impl UniformInt<$ty> { + /// Sample single inclusive, using ONeill's method + #[inline] + pub fn sample_single_inclusive_oneill( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // we use the "Debiased Int Mult (t-opt, m-opt)" rejection sampling method + // described here https://www.pcg-random.org/posts/bounded-rands.html + // and here https://github.com/imneme/bounded-rands + + let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); + if lo < range { + let mut threshold = range.wrapping_neg(); + // this shortcut works best with large ranges + if threshold >= range { + threshold -= range; + if threshold >= range { + threshold %= range; + } + } + while lo < threshold { + let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); + hi = new_hi; + lo = new_lo; + } + } + low.wrapping_add(hi as $ty) + } + + /// Sample single inclusive, using Canon's method + #[inline] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... + if lo_order > range.wrapping_neg() { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using Canon's method with Lemire's early-out + #[inline] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... 
(since range won't be changing we can further + // improve this check with a modulo) + if lo_order < range.wrapping_neg() % range { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using the Bitmask method + #[inline] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let mut range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // the old impl use a mix of methods for different integer sizes, we only use + // the lz method here for a better comparison. + + let mut mask = $u_large::max_value(); + range -= 1; + mask >>= (range | 1).leading_zeros(); + loop { + let x = rng.gen::<$u_large>() & mask; + if x <= range { + return low.wrapping_add(x as $ty); + } + } + } + } }; } - -uniform_int_impl! { i8, u8, u32 } -uniform_int_impl! { i16, u16, u32 } -uniform_int_impl! { i32, u32, u32 } -uniform_int_impl! { i64, u64, u64 } -uniform_int_impl! { i128, u128, u128 } -uniform_int_impl! { isize, usize, usize } -uniform_int_impl! { u8, u8, u32 } -uniform_int_impl! { u16, u16, u32 } -uniform_int_impl! { u32, u32, u32 } -uniform_int_impl! { u64, u64, u64 } -uniform_int_impl! { usize, usize, usize } -uniform_int_impl! { u128, u128, u128 } +uniform_int_impl! { i8, u8, u32, u64 } +uniform_int_impl! { i16, u16, u32, u64 } +uniform_int_impl! { i32, u32, u32, u64 } +uniform_int_impl! { i64, u64, u64, u64 } +uniform_int_impl! { i128, u128, u128, u128 } +uniform_int_impl! { u8, u8, u32, u64 } +uniform_int_impl! { u16, u16, u32, u64 } +uniform_int_impl! { u32, u32, u32, u64 } +uniform_int_impl! { u64, u64, u64, u64 } +uniform_int_impl! { u128, u128, u128, u128 } +#[cfg(any(target_pointer_width = "16", target_pointer_width = "32",))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, u64 } + uniform_int_impl! { usize, usize, usize, u64 } +} +#[cfg(not(any(target_pointer_width = "16", target_pointer_width = "32",)))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, usize } + uniform_int_impl! { usize, usize, usize, usize } +} #[cfg(feature = "simd_support")] macro_rules! 
uniform_simd_int_impl { From 8753100a6c685f8ff990cc9c414becb39f413ac5 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 11:37:50 +0100 Subject: [PATCH 05/17] Add Criterion benchmark for uniform int distributions --- Cargo.toml | 6 +++++ benches/uniform.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 benches/uniform.rs diff --git a/Cargo.toml b/Cargo.toml index 98ba373c68..cb27f4193c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,12 @@ features = ["into_bits"] libc = { version = "0.2.22", optional = true, default-features = false } [dev-dependencies] +criterion = { version = "0.3", features = ["html_reports"] } +num-traits = "0.2.14" rand_pcg = { path = "rand_pcg", version = "0.3.0" } # Only to test serde1 bincode = "1.2.1" + +[[bench]] +name = "uniform" +harness = false diff --git a/benches/uniform.rs b/benches/uniform.rs new file mode 100644 index 0000000000..5e19c046e1 --- /dev/null +++ b/benches/uniform.rs @@ -0,0 +1,67 @@ +// Copyright 2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use criterion::{criterion_group, criterion_main}; +use criterion::{BenchmarkId, Criterion}; +use rand::distributions::uniform::{SampleUniform, UniformSampler}; +use rand::prelude::*; + +type BenchRng = SmallRng; + +macro_rules! bench_int_group { + ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { + for input in $inputs { + $g.bench_with_input( + BenchmarkId::new($name, input.0), + &input.1, + |b, (low, high)| { + let mut rng = BenchRng::from_entropy(); + b.iter(|| <$T as SampleUniform>::Sampler::$f(low, high, &mut rng)) + }, + ); + } + $g.bench_function(BenchmarkId::new($name, "varying"), |b| { + let (low, mut high) = ($inputs[0].1 .0, $inputs[1].1 .1); + let mut rng = BenchRng::from_entropy(); + b.iter(|| { + high = high.wrapping_add(1); + <$T as SampleUniform>::Sampler::$f(low, high, &mut rng) + }) + }); + }; +} + +macro_rules! bench_int { + ($c:expr, $T:ty, $high:expr) => {{ + let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T))); + let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))]; + + bench_int_group!("Old", $T, sample_single_inclusive, g, inputs); + bench_int_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs); + bench_int_group!("Canon", $T, sample_single_inclusive_canon, g, inputs); + bench_int_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs); + bench_int_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs); + }}; +} + +fn uniform_int(c: &mut Criterion) { + // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size + // the exact values were determined with an exhaustive search + bench_int!(c, i8, (i8::MIN, 116)); + bench_int!(c, i16, (i16::MIN, 32407)); + bench_int!(c, i32, (i32::MIN, 1)); + bench_int!(c, i64, (i64::MIN, 1)); + bench_int!(c, i128, (i128::MIN, 1)); +} + +criterion_group! 
{ + name = benches; + config = Criterion::default(); + targets = uniform_int +} +criterion_main!(benches); From aada17da89db175e5123fdc4ede8d2a65b3eb7bb Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 12:28:30 +0100 Subject: [PATCH 06/17] Add uniform SIMD impls for new algs This is from @TheIronBorn (see #1172) --- src/distributions/integer.rs | 4 +- src/distributions/uniform.rs | 417 ++++++++++++++++++++++++++++++++++- src/distributions/utils.rs | 191 ++++++++++++++++ 3 files changed, 607 insertions(+), 5 deletions(-) diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index 19ce71599c..931c2235d7 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -148,9 +148,9 @@ simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,); #[cfg(feature = "simd_support")] simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,); #[cfg(feature = "simd_support")] -simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,); +simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,); #[cfg(feature = "simd_support")] -simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,); +simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,); #[cfg(all( feature = "simd_support", any(target_arch = "x86", target_arch = "x86_64") diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 1029a3f888..231237609f 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -109,6 +109,8 @@ use core::ops::{Range, RangeInclusive}; use crate::distributions::utils::WideningMultiply; +#[cfg(feature = "simd_support")] +use crate::distributions::utils::{OverflowingAdd, SimdCombine, SimdSplit}; use crate::distributions::Distribution; use crate::{Rng, RngCore}; @@ -853,6 +855,381 @@ macro_rules! uniform_simd_int_impl { }; } +#[cfg(feature = "simd_support")] +macro_rules! 
uniform_simd_int_gt8_impl { + ($ty:ident, $unsigned:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + let range: $unsigned = ((high - low) + 1).cast(); + (range, low) + } + + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + let mut vecs_64 = range.simd_split(); + for x in &mut vecs_64 { + *x = rng.gen::().wmul((*x).cast()).0.cast(); + } + let cast_new_hi: $unsigned = vecs_64.simd_combine(); + + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// Canon's method + #[inline(always)] + pub fn sample_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Canon's method + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Canon + Lemire's early-out + #[inline(always)] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // lo_order < range.wrapping_neg() % range { + // may panic if range == 0 + if lo_order.lt((0 - range) % range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Bitmask + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + 
const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident)),+) => {$( + uniform_simd_int_gt8_impl!{ $unsigned, $unsigned } + uniform_simd_int_gt8_impl!{ $signed, $unsigned } + )+}; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_le8_impl { + ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + // wrapping sub and add + let range: $unsigned = ((high - low) + 1).cast(); + (range, low) + } + + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + let (new_hi_order, _) = new_bits.wmul(large_range); + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_scalar( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + + // let (new_hi_order, _) = new_bits.wmul(large_range); + let mut new_hi_order = <$u_extra_large>::default(); 
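+                // The loop below is a lane-by-lane (scalar) widening multiply, equivalent
+                // to the vector `wmul` call commented out above; it is kept so the
+                // benchmarks can compare this scalar path against the vectorised one.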
+ + for i in 0..<$ty>::lanes() { + let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i)); + new_hi_order = new_hi_order.replace(i, shi); + } + + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // lo_order < range.wrapping_neg() % range { + // may panic if range == 0 + if lo_order.lt((0 - range) % range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. 
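+                    // Lanes sampling the full range return their raw random bits, which
+                    // are already uniform; all other lanes return `low + masked`.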
+ let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$( + uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large } + uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large } + )+}; +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u128x2, i128x2), + (u128x4, i128x4), + u128 +} +// #[cfg(feature = "simd_support")] +// uniform_simd_int_impl! { +// (usizex2, isizex2), +// (usizex4, isizex4), +// (usizex8, isizex8), +// usize +// } #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u64x2, i64x2), @@ -860,7 +1237,6 @@ uniform_simd_int_impl! { (u64x8, i64x8), u64 } - #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u32x2, i32x2), @@ -869,7 +1245,6 @@ uniform_simd_int_impl! { (u32x16, i32x16), u32 } - #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u16x2, i16x2), @@ -879,7 +1254,6 @@ uniform_simd_int_impl! { (u16x32, i16x32), u16 } - #[cfg(feature = "simd_support")] uniform_simd_int_impl! { (u8x2, i8x2), @@ -891,6 +1265,43 @@ uniform_simd_int_impl! { u8 } +#[cfg(feature = "simd_support")] +uniform_simd_int_gt8_impl! { + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + + (u16x16, i16x16), + (u16x32, i16x32), + + (u32x16, i32x16) +} +#[cfg(feature = "simd_support")] +uniform_simd_int_le8_impl! { + (u8x2, i8x2, i64x2, u64x2), + (u8x4, i8x4, i64x4, u64x4), + (u8x8, i8x8, i64x8, u64x8), + + (u16x2, i16x2, i64x2, u64x2), + (u16x4, i16x4, i64x4, u64x4), + (u16x8, i16x8, i64x8, u64x8), + + (u32x2, i32x2, i64x2, u64x2), + (u32x4, i32x4, i64x4, u64x4), + (u32x8, i32x8, i64x8, u64x8), + + (u64x2, i64x2, i64x2, u64x2), + (u64x4, i64x4, i64x4, u64x4), + (u64x8, i64x8, i64x8, u64x8), + + (u128x2, i128x2, i64x2, u128x2), + (u128x4, i128x4, i64x4, u128x4) + + // (usizex2, isizex2, i64x2, u64x2), + // (usizex4, isizex4, i64x4, u64x4), + // (usizex8, isizex8, i64x8, u64x8) +} + #[cfg(test)] mod tests { diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 89da5fd7aa..f614524364 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -202,6 +202,197 @@ mod simd_wmul { wmul_impl_large! { (u16x32,) u16, 8 } wmul_impl_large! { (u32x16,) u32, 16 } wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 } + wmul_impl_large! { (u128x2, u128x4,) u128, 64 } + + #[cfg(target_pointer_width = "64")] + wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 32 } + #[cfg(target_pointer_width = "32")] + wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 16 } + #[cfg(target_pointer_width = "16")] + wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 8 } +} + +pub(crate) trait OverflowingAdd { + fn overflowing_add(&self, y: Self) -> (Self, T) where Self: Sized; +} + +#[cfg(feature = "simd_support")] +macro_rules! 
impl_overflowing_add { + ($(($ty:ty, $signed_ty:ty, $mask:ty)),+) => {$( + #[cfg(feature = "simd_support")] + impl OverflowingAdd<$mask> for $ty { + #[inline] + fn overflowing_add(&self, y: Self) -> (Self, $mask) { + let sum = *self + y; + + let lane_bitwidth = std::mem::size_of::<$ty>() / <$ty>::lanes(); + let mask = <$ty>::splat(1 << (lane_bitwidth - 1)); + + let neg_self: $signed_ty = (mask ^ *self).cast(); + let neg_sum: $signed_ty = (mask ^ sum).cast(); + + let overflowed = neg_self.gt(neg_sum); + + (sum, overflowed) + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_overflowing_add!( + (u8x2, i8x2, m8x2), + (u8x4, i8x4, m8x4), + (u8x8, i8x8, m8x8), + (u8x16, i8x16, m8x16), + (u8x32, i8x32, m8x32), + (u8x64, i8x64, m8x64), + + (u16x2, i16x2, m16x2), + (u16x4, i16x4, m16x4), + (u16x8, i16x8, m16x8), + (u16x16, i16x16, m16x16), + (u16x32, i16x32, m16x32), + + (u32x2, i32x2, m32x2), + (u32x4, i32x4, m32x4), + (u32x8, i32x8, m32x8), + (u32x16, i32x16, m32x16), + + (u64x2, i64x2, m64x2), + (u64x4, i64x4, m64x4), + (u64x8, i64x8, m64x8), + + (u128x2, i128x2, m128x2), + (u128x4, i128x4, m128x4) +); + + +#[cfg(feature = "simd_support")] +pub(crate) trait SimdCombine { + fn simd_combine(&self) -> T; +} + +#[cfg(feature = "simd_support")] +macro_rules! impl_combine_2 { + ($(($wide:ident, $short:ident)),+) => {$( + impl SimdCombine<$wide> for [$short] { + #[inline] + fn simd_combine(&self) -> $wide { + shuffle!(self[0], self[1], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_combine_2!{ + (u32x16, u32x8), + (u16x16, u16x8), + (u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +macro_rules! impl_combine_4 { + ($(($wide:ident, $mid:ident, $short:ident)),+) => {$( + impl SimdCombine<$wide> for [$short] { + #[inline] + fn simd_combine(&self) -> $wide { + let a: $mid = self.chunks_exact(2).nth(0).unwrap().simd_combine(); + let b: $mid = self.chunks_exact(2).nth(1).unwrap().simd_combine(); + shuffle!(a, b, [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ]) + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_combine_4!{ + (u16x32, u16x16, u16x8), + (u8x32, u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +impl SimdCombine for [u8x8; 8] { + #[inline] + fn simd_combine(&self) -> u8x64 { + let a: u8x32 = self.chunks_exact(4).nth(0).unwrap().simd_combine(); + let b: u8x32 = self.chunks_exact(4).nth(1).unwrap().simd_combine(); + shuffle!(a, b, [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ]) + } +} + + +#[cfg(feature = "simd_support")] +pub(crate) trait SimdSplit { + fn simd_split(&self) -> T; +} + +#[cfg(feature = "simd_support")] +macro_rules! impl_split_2 { + ($(($wide:ident, $short:ident)),+) => {$( + impl SimdSplit<[$short; 2]> for $wide { + #[inline] + fn simd_split(&self) -> [$short; 2] { + let a = shuffle!(self, [0, 1, 2, 3, 4, 5, 6, 7]); + let b = shuffle!(self, [8, 9, 10, 11, 12, 13, 14, 15]); + [a, b] + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_split_2!{ + (u32x16, u32x8), + (u16x16, u16x8), + (u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +macro_rules! 
impl_split_4 { + ($(($wide:ident, $mid:ident, $short:ident)),+) => {$( + impl SimdSplit<[$short; 4]> for $wide { + #[inline] + fn simd_split(&self) -> [$short; 4] { + let a = shuffle!(self, [0, 1, 2, 3, 4, 5, 6, 7]); + let b = shuffle!(self, [8, 9, 10, 11, 12, 13, 14, 15]); + let c = shuffle!(self, [16, 17, 18, 19, 20, 21, 22, 23]); + let d = shuffle!(self, [24, 25, 26, 27, 28, 29, 30, 31]); + [a, b, c, d] + } + } + )+}; +} + +#[cfg(feature = "simd_support")] +impl_split_4!{ + (u16x32, u16x16, u16x8), + (u8x32, u8x16, u8x8) +} + +#[cfg(feature = "simd_support")] +impl SimdSplit<[u8x8; 8]> for u8x64 { + #[inline] + fn simd_split(&self) -> [u8x8; 8] { + let x0 = shuffle!(self, [0, 1, 2, 3, 4, 5, 6, 7]); + let x1 = shuffle!(self, [8, 9, 10, 11, 12, 13, 14, 15]); + let x2 = shuffle!(self, [16, 17, 18, 19, 20, 21, 22, 23]); + let x3 = shuffle!(self, [24, 25, 26, 27, 28, 29, 30, 31]); + let x4 = shuffle!(self, [32, 33, 34, 35, 36, 37, 38, 39]); + let x5 = shuffle!(self, [40, 41, 42, 43, 44, 45, 46, 47]); + let x6 = shuffle!(self, [48, 49, 50, 51, 52, 53, 54, 55]); + let x7 = shuffle!(self, [56, 57, 58, 59, 60, 61, 62, 63]); + [x0, x1, x2, x3, x4, x5, x6, x7] + } } /// Helper trait when dealing with scalar and SIMD floating point types. From 1cfba720bf73e79e5fb301b0a1b142adda4bb36e Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 16:19:01 +0100 Subject: [PATCH 07/17] Add uniform benches for SIMD types --- benches/uniform.rs | 68 +++++++++++++++++++++++++++++++++++- src/distributions/uniform.rs | 60 +++++-------------------------- 2 files changed, 76 insertions(+), 52 deletions(-) diff --git a/benches/uniform.rs b/benches/uniform.rs index 5e19c046e1..e5b02a2912 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -8,6 +8,7 @@ use criterion::{criterion_group, criterion_main}; use criterion::{BenchmarkId, Criterion}; +#[cfg(feature = "simd_support")] use packed_simd::*; use rand::distributions::uniform::{SampleUniform, UniformSampler}; use rand::prelude::*; @@ -59,9 +60,74 @@ fn uniform_int(c: &mut Criterion) { bench_int!(c, i128, (i128::MIN, 1)); } +#[cfg(feature = "simd_support")] +macro_rules! bench_simd_group { + ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { + for input in $inputs { + $g.bench_with_input( + BenchmarkId::new($name, input.0), + &input.1, + |b, (low, high)| { + let mut rng = BenchRng::from_entropy(); + let (low, high) = (<$T>::splat(*low), <$T>::splat(*high)); + b.iter(|| <$T as SampleUniform>::Sampler::$f(low, high, &mut rng)) + }, + ); + } + $g.bench_function(BenchmarkId::new($name, "varying"), |b| { + let (low, mut high) = (<$T>::splat($inputs[0].1 .0), <$T>::splat($inputs[1].1 .1)); + let mut rng = BenchRng::from_entropy(); + b.iter(|| { + high += 1; + <$T as SampleUniform>::Sampler::$f(low, high, &mut rng) + }) + }); + }; +} + +#[cfg(feature = "simd_support")] +macro_rules! 
bench_simd { + ($c:expr, $T:ty, $high:expr/*, $incr:expr*/) => {{ + let mut g = $c.benchmark_group(concat!("uniform_simd_", stringify!($T))); + let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))]; + + bench_simd_group!("Old", $T, sample_single_inclusive, g, inputs); + bench_simd_group!("Canon", $T, sample_single_inclusive_canon, g, inputs); + bench_simd_group!( + "Canon-branchless", + $T, + sample_single_inclusive_canon_branchless, + g, + inputs + ); + bench_simd_group!("Canon-scalar", $T, sample_inclusive_canon_scalar, g, inputs); + bench_simd_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs); + }}; +} + +#[cfg(feature = "simd_support")] +fn uniform_simd(c: &mut Criterion) { + // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size + // the exact values were determined with an exhaustive search + bench_simd!(c, i8x16, (i8::MIN, 116)); + bench_simd!(c, i8x32, (i8::MIN, 116)); + bench_simd!(c, i16x8, (i16::MIN, 32407)); + bench_simd!(c, i16x16, (i16::MIN, 32407)); + bench_simd!(c, i32x4, (i32::MIN, 1)); + bench_simd!(c, i32x8, (i32::MIN, 1)); + bench_simd!(c, i64x2, (i64::MIN, 1)); + bench_simd!(c, i64x4, (i64::MIN, 1)); + bench_simd!(c, i64x8, (i64::MIN, 1)); + bench_simd!(c, i128x2, (i128::MIN, 1)); + bench_simd!(c, i128x4, (i128::MIN, 1)); +} + +#[cfg(not(feature = "simd_support"))] +fn uniform_simd(_c: &mut Criterion) {} + criterion_group! { name = benches; config = Criterion::default(); - targets = uniform_int + targets = uniform_int, uniform_simd } criterion_main!(benches); diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 231237609f..33ee9d63d0 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -891,7 +891,7 @@ macro_rules! uniform_simd_int_gt8_impl { /// Canon's method #[inline(always)] - pub fn sample_inclusive_canon( + pub fn sample_single_inclusive_canon_branchless( low_b: B1, high_b: B2, rng: &mut R, ) -> $ty where @@ -911,34 +911,21 @@ macro_rules! uniform_simd_int_gt8_impl { is_full_range.select(cast_rand_bits, low + cast_result) } - /// Canon's method + /// #[inline(always)] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, + pub fn sample_inclusive_canon_scalar( + _low_b: B1, _high_b: B2, _rng: &mut R, ) -> $ty where B1: SampleBorrow<$ty> + Sized, B2: SampleBorrow<$ty> + Sized, { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - if lo_order.gt(0 - range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) + Default::default() // dummy impl } - /// Canon + Lemire's early-out + /// Canon's method #[inline(always)] - pub fn sample_inclusive_canon_lemire( + pub fn sample_single_inclusive_canon( low_b: B1, high_b: B2, rng: &mut R, ) -> $ty where @@ -952,9 +939,7 @@ macro_rules! uniform_simd_int_gt8_impl { let rand_bits = rng.gen::<$unsigned>(); let (mut result, lo_order) = rand_bits.wmul(range); - // lo_order < range.wrapping_neg() % range { - // may panic if range == 0 - if lo_order.lt((0 - range) % range).any() { + if lo_order.gt(0 - range).any() { Self::canon_successive(range, &mut result, lo_order, rng); } @@ -1053,7 +1038,7 @@ macro_rules! 
uniform_simd_int_le8_impl { /// #[inline(always)] - pub fn sample_inclusive_canon( + pub fn sample_single_inclusive_canon_branchless( low_b: B1, high_b: B2, rng: &mut R, ) -> $ty where @@ -1137,33 +1122,6 @@ macro_rules! uniform_simd_int_le8_impl { is_full_range.select(cast_rand_bits, low + cast_result) } - /// - #[inline(always)] - pub fn sample_inclusive_canon_lemire( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - // lo_order < range.wrapping_neg() % range { - // may panic if range == 0 - if lo_order.lt((0 - range) % range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - /// #[inline(always)] pub fn sample_single_inclusive_bitmask( From 7cfa6f90228daf8cc529e598338f186263355853 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 16:41:54 +0100 Subject: [PATCH 08/17] Uniform: dedup --- src/distributions/uniform.rs | 43 ++++++++++++++---------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 33ee9d63d0..4588856884 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -844,20 +844,7 @@ macro_rules! uniform_simd_int_impl { } } } - }; - // bulk implementation - ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { - $( - uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); - uniform_simd_int_impl!($signed, $unsigned, $u_scalar); - )+ - }; -} - -#[cfg(feature = "simd_support")] -macro_rules! uniform_simd_int_gt8_impl { - ($ty:ident, $unsigned:ident) => { impl UniformInt<$ty> { #[inline(always)] fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) @@ -868,10 +855,26 @@ macro_rules! uniform_simd_int_gt8_impl { let low = *low_b.borrow(); let high = *high_b.borrow(); assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + // wrapping sub and add let range: $unsigned = ((high - low) + 1).cast(); (range, low) } + } + }; + // bulk implementation + ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + $( + uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); + uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + )+ + }; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_gt8_impl { + ($ty:ident, $unsigned:ident) => { + impl UniformInt<$ty> { #[inline(always)] fn canon_successive( range: $unsigned, @@ -1005,20 +1008,6 @@ macro_rules! uniform_simd_int_gt8_impl { macro_rules! 
uniform_simd_int_le8_impl { ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { impl UniformInt<$ty> { - #[inline(always)] - fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); - // wrapping sub and add - let range: $unsigned = ((high - low) + 1).cast(); - (range, low) - } - #[inline(always)] fn canon_successive( range: $unsigned, From 4518f4dac6aa86483f3939e2323f963ea881290b Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 24 Feb 2022 15:27:39 +0000 Subject: [PATCH 09/17] Move UniformInt to a new submodule --- src/distributions/uniform.rs | 1038 +--------------------- src/distributions/uniform/uniform_int.rs | 1038 ++++++++++++++++++++++ 2 files changed, 1043 insertions(+), 1033 deletions(-) create mode 100644 src/distributions/uniform/uniform_int.rs diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 4588856884..4b81ab9ee9 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -106,26 +106,20 @@ //! [`UniformDuration`]: crate::distributions::uniform::UniformDuration //! [`SampleBorrow::borrow`]: crate::distributions::uniform::SampleBorrow::borrow -use core::ops::{Range, RangeInclusive}; - -use crate::distributions::utils::WideningMultiply; -#[cfg(feature = "simd_support")] -use crate::distributions::utils::{OverflowingAdd, SimdCombine, SimdSplit}; -use crate::distributions::Distribution; -use crate::{Rng, RngCore}; - #[cfg(not(feature = "std"))] #[allow(unused_imports)] // rustc doesn't detect that this is actually used use crate::distributions::utils::Float; - -#[cfg(feature = "simd_support")] use packed_simd::*; - +use crate::distributions::Distribution; +use crate::{Rng, RngCore}; +use core::ops::{Range, RangeInclusive}; #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; mod uniform_float; +mod uniform_int; mod uniform_other; pub use uniform_float::UniformFloat; +pub use uniform_int::UniformInt; pub use uniform_other::{UniformChar, UniformDuration}; /// Sample values uniformly between two bounds. @@ -387,992 +381,10 @@ impl SampleRange for RangeInclusive { } -//////////////////////////////////////////////////////////////////////////////// - -// What follows are all back-ends. - - -/// The back-end implementing [`UniformSampler`] for integer types. -/// -/// Unless you are implementing [`UniformSampler`] for your own type, this type -/// should not be used directly, use [`Uniform`] instead. -/// -/// # Implementation notes -/// -/// For simplicity, we use the same generic struct `UniformInt` for all -/// integer types `X`. This gives us only one field type, `X`; to store unsigned -/// values of this size, we take use the fact that these conversions are no-ops. -/// -/// For a closed range, the number of possible numbers we should generate is -/// `range = (high - low + 1)`. To avoid bias, we must ensure that the size of -/// our sample space, `zone`, is a multiple of `range`; other values must be -/// rejected (by replacing with a new random sample). -/// -/// As a special case, we use `range = 0` to represent the full range of the -/// result type (i.e. for `new_inclusive($ty::MIN, $ty::MAX)`). -/// -/// The optimum `zone` is the largest product of `range` which fits in our -/// (unsigned) target type. 
We calculate this by calculating how many numbers we -/// must reject: `reject = (MAX + 1) % range = (MAX - range + 1) % range`. Any (large) -/// product of `range` will suffice, thus in `sample_single` we multiply by a -/// power of 2 via bit-shifting (faster but may cause more rejections). -/// -/// The smallest integer PRNGs generate is `u32`. For 8- and 16-bit outputs we -/// use `u32` for our `zone` and samples (because it's not slower and because -/// it reduces the chance of having to reject a sample). In this case we cannot -/// store `zone` in the target type since it is too large, however we know -/// `ints_to_reject < range <= $unsigned::MAX`. -/// -/// An alternative to using a modulus is widening multiply: After a widening -/// multiply by `range`, the result is in the high word. Then comparing the low -/// word against `zone` makes sure our distribution is uniform. -#[derive(Clone, Copy, Debug, PartialEq)] -#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] -pub struct UniformInt { - low: X, - range: X, - z: X, // either ints_to_reject or zone depending on implementation -} - -macro_rules! uniform_int_impl { - ($ty:ty, $unsigned:ident, $u_large:ident, $u_extra_large:ident) => { - impl SampleUniform for $ty { - type Sampler = UniformInt<$ty>; - } - - impl UniformSampler for UniformInt<$ty> { - // We play free and fast with unsigned vs signed here - // (when $ty is signed), but that's fine, since the - // contract of this macro is for $ty and $unsigned to be - // "bit-equal", so casting between them is a no-op. - - type X = $ty; - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low < high, "Uniform::new called with `low >= high`"); - UniformSampler::new_inclusive(low, high - 1) - } - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "Uniform::new_inclusive called with `low > high`" - ); - let unsigned_max = ::core::$u_large::MAX; - - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned; - let ints_to_reject = if range > 0 { - let range = $u_large::from(range); - (unsigned_max - range + 1) % range - } else { - 0 - }; - - UniformInt { - low, - // These are really $unsigned values, but store as $ty: - range: range as $ty, - z: ints_to_reject as $unsigned as $ty, - } - } - - #[inline] - fn sample(&self, rng: &mut R) -> Self::X { - let range = self.range as $unsigned as $u_large; - if range > 0 { - let unsigned_max = ::core::$u_large::MAX; - let zone = unsigned_max - (self.z as $unsigned as $u_large); - loop { - let v: $u_large = rng.gen(); - let (hi, lo) = v.wmul(range); - if lo <= zone { - return self.low.wrapping_add(hi as $ty); - } - } - } else { - // Sample from the entire integer range. 
- rng.gen() - } - } - - #[inline] - fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low < high, "UniformSampler::sample_single: low >= high"); - Self::sample_single_inclusive(low, high - 1, rng) - } - - #[inline] - fn sample_single_inclusive( - low_b: B1, high_b: B2, rng: &mut R, - ) -> Self::X - where - B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - let zone = if ::core::$unsigned::MAX <= ::core::u16::MAX as $unsigned { - // Using a modulus is faster than the approximation for - // i8 and i16. I suppose we trade the cost of one - // modulus for near-perfect branch prediction. - let unsigned_max: $u_large = ::core::$u_large::MAX; - let ints_to_reject = (unsigned_max - range + 1) % range; - unsigned_max - ints_to_reject - } else { - // conservative but fast approximation. `- 1` is necessary to allow the - // same comparison without bias. - (range << range.leading_zeros()).wrapping_sub(1) - }; - - loop { - let v: $u_large = rng.gen(); - let (hi, lo) = v.wmul(range); - if lo <= zone { - return low.wrapping_add(hi as $ty); - } - } - } - } - - impl UniformInt<$ty> { - /// Sample single inclusive, using ONeill's method - #[inline] - pub fn sample_single_inclusive_oneill( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // we use the "Debiased Int Mult (t-opt, m-opt)" rejection sampling method - // described here https://www.pcg-random.org/posts/bounded-rands.html - // and here https://github.com/imneme/bounded-rands - - let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); - if lo < range { - let mut threshold = range.wrapping_neg(); - // this shortcut works best with large ranges - if threshold >= range { - threshold -= range; - if threshold >= range { - threshold %= range; - } - } - while lo < threshold { - let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); - hi = new_hi; - lo = new_lo; - } - } - low.wrapping_add(hi as $ty) - } - - /// Sample single inclusive, using Canon's method - #[inline] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // generate a sample using a sensible integer type - let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); - - // if the sample is biased... 
- if lo_order > range.wrapping_neg() { - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let (new_hi_order, _) = - (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); - // and adjust if needed - result += lo_order - .checked_add(new_hi_order as $u_extra_large) - .is_none() as $u_extra_large; - } - - low.wrapping_add(result as $ty) - } - - /// Sample single inclusive, using Canon's method with Lemire's early-out - #[inline] - pub fn sample_inclusive_canon_lemire( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // generate a sample using a sensible integer type - let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); - - // if the sample is biased... (since range won't be changing we can further - // improve this check with a modulo) - if lo_order < range.wrapping_neg() % range { - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let (new_hi_order, _) = - (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); - // and adjust if needed - result += lo_order - .checked_add(new_hi_order as $u_extra_large) - .is_none() as $u_extra_large; - } - - low.wrapping_add(result as $ty) - } - - /// Sample single inclusive, using the Bitmask method - #[inline] - pub fn sample_single_inclusive_bitmask( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!( - low <= high, - "UniformSampler::sample_single_inclusive: low > high" - ); - let mut range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; - if range == 0 { - // Range is MAX+1 (unrepresentable), so we need a special case - return rng.gen(); - } - - // the old impl use a mix of methods for different integer sizes, we only use - // the lz method here for a better comparison. - - let mut mask = $u_large::max_value(); - range -= 1; - mask >>= (range | 1).leading_zeros(); - loop { - let x = rng.gen::<$u_large>() & mask; - if x <= range { - return low.wrapping_add(x as $ty); - } - } - } - } - }; -} -uniform_int_impl! { i8, u8, u32, u64 } -uniform_int_impl! { i16, u16, u32, u64 } -uniform_int_impl! { i32, u32, u32, u64 } -uniform_int_impl! { i64, u64, u64, u64 } -uniform_int_impl! { i128, u128, u128, u128 } -uniform_int_impl! { u8, u8, u32, u64 } -uniform_int_impl! { u16, u16, u32, u64 } -uniform_int_impl! { u32, u32, u32, u64 } -uniform_int_impl! { u64, u64, u64, u64 } -uniform_int_impl! { u128, u128, u128, u128 } -#[cfg(any(target_pointer_width = "16", target_pointer_width = "32",))] -mod isize_int_impls { - use super::*; - uniform_int_impl! { isize, usize, usize, u64 } - uniform_int_impl! { usize, usize, usize, u64 } -} -#[cfg(not(any(target_pointer_width = "16", target_pointer_width = "32",)))] -mod isize_int_impls { - use super::*; - uniform_int_impl! { isize, usize, usize, usize } - uniform_int_impl! { usize, usize, usize, usize } -} - -#[cfg(feature = "simd_support")] -macro_rules! 
uniform_simd_int_impl { - ($ty:ident, $unsigned:ident, $u_scalar:ident) => { - // The "pick the largest zone that can fit in an `u32`" optimization - // is less useful here. Multiple lanes complicate things, we don't - // know the PRNG's minimal output size, and casting to a larger vector - // is generally a bad idea for SIMD performance. The user can still - // implement it manually. - - // TODO: look into `Uniform::::new(0u32, 100)` functionality - // perhaps `impl SampleUniform for $u_scalar`? - impl SampleUniform for $ty { - type Sampler = UniformInt<$ty>; - } - - impl UniformSampler for UniformInt<$ty> { - type X = $ty; - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new(low_b: B1, high_b: B2) -> Self - where B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); - UniformSampler::new_inclusive(low, high - 1) - } - - #[inline] // if the range is constant, this helps LLVM to do the - // calculations at compile-time. - fn new_inclusive(low_b: B1, high_b: B2) -> Self - where B1: SampleBorrow + Sized, - B2: SampleBorrow + Sized - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.le(high).all(), - "Uniform::new_inclusive called with `low > high`"); - let unsigned_max = ::core::$u_scalar::MAX; - - // NOTE: these may need to be replaced with explicitly - // wrapping operations if `packed_simd` changes - let range: $unsigned = ((high - low) + 1).cast(); - // `% 0` will panic at runtime. - let not_full_range = range.gt($unsigned::splat(0)); - // replacing 0 with `unsigned_max` allows a faster `select` - // with bitwise OR - let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); - // wrapping addition - let ints_to_reject = (unsigned_max - range + 1) % modulo; - // When `range` is 0, `lo` of `v.wmul(range)` will always be - // zero which means only one sample is needed. - let zone = unsigned_max - ints_to_reject; - - UniformInt { - low, - // These are really $unsigned values, but store as $ty: - range: range.cast(), - z: zone.cast(), - } - } - - fn sample(&self, rng: &mut R) -> Self::X { - let range: $unsigned = self.range.cast(); - let zone: $unsigned = self.z.cast(); - - // This might seem very slow, generating a whole new - // SIMD vector for every sample rejection. For most uses - // though, the chance of rejection is small and provides good - // general performance. With multiple lanes, that chance is - // multiplied. To mitigate this, we replace only the lanes of - // the vector which fail, iteratively reducing the chance of - // rejection. The replacement method does however add a little - // overhead. Benchmarking or calculating probabilities might - // reveal contexts where this replacement method is slower. - let mut v: $unsigned = rng.gen(); - loop { - let (hi, lo) = v.wmul(range); - let mask = lo.le(zone); - if mask.all() { - let hi: $ty = hi.cast(); - // wrapping addition - let result = self.low + hi; - // `select` here compiles to a blend operation - // When `range.eq(0).none()` the compare and blend - // operations are avoided. 
- let v: $ty = v.cast(); - return range.gt($unsigned::splat(0)).select(result, v); - } - // Replace only the failing lanes - v = mask.select(v, rng.gen()); - } - } - } - - impl UniformInt<$ty> { - #[inline(always)] - fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let low = *low_b.borrow(); - let high = *high_b.borrow(); - assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); - // wrapping sub and add - let range: $unsigned = ((high - low) + 1).cast(); - (range, low) - } - } - }; - - // bulk implementation - ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { - $( - uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); - uniform_simd_int_impl!($signed, $unsigned, $u_scalar); - )+ - }; -} - -#[cfg(feature = "simd_support")] -macro_rules! uniform_simd_int_gt8_impl { - ($ty:ident, $unsigned:ident) => { - impl UniformInt<$ty> { - #[inline(always)] - fn canon_successive( - range: $unsigned, - result: &mut $unsigned, - lo_order: $unsigned, - rng: &mut R - ) { - let mut vecs_64 = range.simd_split(); - for x in &mut vecs_64 { - *x = rng.gen::().wmul((*x).cast()).0.cast(); - } - let cast_new_hi: $unsigned = vecs_64.simd_combine(); - - let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); - *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); - } - - /// Canon's method - #[inline(always)] - pub fn sample_single_inclusive_canon_branchless( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - Self::canon_successive(range, &mut result, lo_order, rng); - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_inclusive_canon_scalar( - _low_b: B1, _high_b: B2, _rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - Default::default() // dummy impl - } - - /// Canon's method - #[inline(always)] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - if lo_order.gt(0 - range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// Bitmask - #[inline(always)] - pub fn sample_single_inclusive_bitmask( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (mut range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate bitmask - range -= 1; - let mut mask = range | 1; - - mask |= mask >> 1; - mask |= mask >> 2; - mask |= mask >> 4; - - const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / 
<$ty>::lanes(); - if LANE_WIDTH >= 16 { mask |= mask >> 8; } - if LANE_WIDTH >= 32 { mask |= mask >> 16; } - if LANE_WIDTH >= 64 { mask |= mask >> 32; } - if LANE_WIDTH >= 128 { mask |= mask >> 64; } - - let mut v: $unsigned = rng.gen(); - loop { - let masked = v & mask; - let accept = masked.le(range); - if accept.all() { - let masked: $ty = masked.cast(); - // wrapping addition - let result = low + masked; - // `select` here compiles to a blend operation - // When `range.eq(0).none()` the compare and blend - // operations are avoided. - let v: $ty = v.cast(); - return is_full_range.select(v, result); - } - // Replace only the failing lanes - v = accept.select(v, rng.gen()); - } - } - } - }; - - ($(($unsigned:ident, $signed:ident)),+) => {$( - uniform_simd_int_gt8_impl!{ $unsigned, $unsigned } - uniform_simd_int_gt8_impl!{ $signed, $unsigned } - )+}; -} - -#[cfg(feature = "simd_support")] -macro_rules! uniform_simd_int_le8_impl { - ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { - impl UniformInt<$ty> { - #[inline(always)] - fn canon_successive( - range: $unsigned, - result: &mut $unsigned, - lo_order: $unsigned, - rng: &mut R - ) { - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); - let large_range: $u_extra_large = range.cast(); - let (new_hi_order, _) = new_bits.wmul(large_range); - // and adjust if needed - let cast_new_hi: $unsigned = new_hi_order.cast(); - let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); - *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); - } - - /// - #[inline(always)] - pub fn sample_single_inclusive_canon_branchless( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - Self::canon_successive(range, &mut result, lo_order, rng); - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_inclusive_canon_scalar( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - // ...generate a new sample with 64 more bits, enough that bias is undetectable - let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); - let large_range: $u_extra_large = range.cast(); - - // let (new_hi_order, _) = new_bits.wmul(large_range); - let mut new_hi_order = <$u_extra_large>::default(); - - for i in 0..<$ty>::lanes() { - let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i)); - new_hi_order = new_hi_order.replace(i, shi); - } - - // and adjust if needed - let cast_new_hi: $unsigned = new_hi_order.cast(); - let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); - result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = 
rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_single_inclusive_canon( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate a sample using a sensible integer type - let rand_bits = rng.gen::<$unsigned>(); - let (mut result, lo_order) = rand_bits.wmul(range); - - if lo_order.gt(0 - range).any() { - Self::canon_successive(range, &mut result, lo_order, rng); - } - - let cast_result: $ty = result.cast(); - let cast_rand_bits: $ty = rand_bits.cast(); - is_full_range.select(cast_rand_bits, low + cast_result) - } - - /// - #[inline(always)] - pub fn sample_single_inclusive_bitmask( - low_b: B1, high_b: B2, rng: &mut R, - ) -> $ty - where - B1: SampleBorrow<$ty> + Sized, - B2: SampleBorrow<$ty> + Sized, - { - let (mut range, low) = Self::sample_inc_setup(low_b, high_b); - let is_full_range = range.eq($unsigned::splat(0)); - - // generate bitmask - range -= 1; - let mut mask = range | 1; - - mask |= mask >> 1; - mask |= mask >> 2; - mask |= mask >> 4; - - const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); - if LANE_WIDTH >= 16 { mask |= mask >> 8; } - if LANE_WIDTH >= 32 { mask |= mask >> 16; } - if LANE_WIDTH >= 64 { mask |= mask >> 32; } - if LANE_WIDTH >= 128 { mask |= mask >> 64; } - - let mut v: $unsigned = rng.gen(); - loop { - let masked = v & mask; - let accept = masked.le(range); - if accept.all() { - let masked: $ty = masked.cast(); - // wrapping addition - let result = low + masked; - // `select` here compiles to a blend operation - // When `range.eq(0).none()` the compare and blend - // operations are avoided. - let v: $ty = v.cast(); - return is_full_range.select(v, result); - } - // Replace only the failing lanes - v = accept.select(v, rng.gen()); - } - } - } - }; - - ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$( - uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large } - uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large } - )+}; -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u128x2, i128x2), - (u128x4, i128x4), - u128 -} -// #[cfg(feature = "simd_support")] -// uniform_simd_int_impl! { -// (usizex2, isizex2), -// (usizex4, isizex4), -// (usizex8, isizex8), -// usize -// } -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u64x2, i64x2), - (u64x4, i64x4), - (u64x8, i64x8), - u64 -} -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u32x2, i32x2), - (u32x4, i32x4), - (u32x8, i32x8), - (u32x16, i32x16), - u32 -} -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u16x2, i16x2), - (u16x4, i16x4), - (u16x8, i16x8), - (u16x16, i16x16), - (u16x32, i16x32), - u16 -} -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u8x2, i8x2), - (u8x4, i8x4), - (u8x8, i8x8), - (u8x16, i8x16), - (u8x32, i8x32), - (u8x64, i8x64), - u8 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_gt8_impl! { - (u8x16, i8x16), - (u8x32, i8x32), - (u8x64, i8x64), - - (u16x16, i16x16), - (u16x32, i16x32), - - (u32x16, i32x16) -} -#[cfg(feature = "simd_support")] -uniform_simd_int_le8_impl! 
{ - (u8x2, i8x2, i64x2, u64x2), - (u8x4, i8x4, i64x4, u64x4), - (u8x8, i8x8, i64x8, u64x8), - - (u16x2, i16x2, i64x2, u64x2), - (u16x4, i16x4, i64x4, u64x4), - (u16x8, i16x8, i64x8, u64x8), - - (u32x2, i32x2, i64x2, u64x2), - (u32x4, i32x4, i64x4, u64x4), - (u32x8, i32x8, i64x8, u64x8), - - (u64x2, i64x2, i64x2, u64x2), - (u64x4, i64x4, i64x4, u64x4), - (u64x8, i64x8, i64x8, u64x8), - - (u128x2, i128x2, i64x2, u128x2), - (u128x4, i128x4, i64x4, u128x4) - - // (usizex2, isizex2, i64x2, u64x2), - // (usizex4, isizex4, i64x4, u64x4), - // (usizex8, isizex8, i64x8, u64x8) -} - - #[cfg(test)] mod tests { use super::*; - #[test] - #[cfg(feature = "serde1")] - fn test_uniform_serialization() { - let unit_box: Uniform = Uniform::new(-1, 1); - let de_unit_box: Uniform = - bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); - - assert_eq!(unit_box.0.low, de_unit_box.0.low); - assert_eq!(unit_box.0.range, de_unit_box.0.range); - assert_eq!(unit_box.0.z, de_unit_box.0.z); - } - - #[should_panic] - #[test] - fn test_uniform_bad_limits_equal_int() { - Uniform::new(10, 10); - } - - #[test] - fn test_uniform_good_limits_equal_int() { - let mut rng = crate::test::rng(804); - let dist = Uniform::new_inclusive(10, 10); - for _ in 0..20 { - assert_eq!(rng.sample(dist), 10); - } - } - - #[should_panic] - #[test] - fn test_uniform_bad_limits_flipped_int() { - Uniform::new(10, 5); - } - - #[test] - #[cfg_attr(miri, ignore)] // Miri is too slow - fn test_integers() { - use core::{i128, u128}; - use core::{i16, i32, i64, i8, isize}; - use core::{u16, u32, u64, u8, usize}; - - let mut rng = crate::test::rng(251); - macro_rules! t { - ($ty:ident, $v:expr, $le:expr, $lt:expr) => {{ - for &(low, high) in $v.iter() { - let my_uniform = Uniform::new(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $lt(v, high)); - } - - let my_uniform = Uniform::new_inclusive(low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $le(v, high)); - } - - let my_uniform = Uniform::new(&low, high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $lt(v, high)); - } - - let my_uniform = Uniform::new_inclusive(&low, &high); - for _ in 0..1000 { - let v: $ty = rng.sample(my_uniform); - assert!($le(low, v) && $le(v, high)); - } - - for _ in 0..1000 { - let v = <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng); - assert!($le(low, v) && $lt(v, high)); - } - - for _ in 0..1000 { - let v = <$ty as SampleUniform>::Sampler::sample_single_inclusive(low, high, &mut rng); - assert!($le(low, v) && $le(v, high)); - } - } - }}; - - // scalar bulk - ($($ty:ident),*) => {{ - $(t!( - $ty, - [(0, 10), (10, 127), ($ty::MIN, $ty::MAX)], - |x, y| x <= y, - |x, y| x < y - );)* - }}; - - // simd bulk - ($($ty:ident),* => $scalar:ident) => {{ - $(t!( - $ty, - [ - ($ty::splat(0), $ty::splat(10)), - ($ty::splat(10), $ty::splat(127)), - ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), - ], - |x: $ty, y| x.le(y).all(), - |x: $ty, y| x.lt(y).all() - );)* - }}; - } - t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, i128, u128); - - #[cfg(feature = "simd_support")] - { - t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); - t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); - t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); - t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); - t!(u32x2, u32x4, u32x8, u32x16 => u32); - t!(i32x2, i32x4, i32x8, i32x16 => i32); - t!(u64x2, u64x4, u64x8 => u64); - t!(i64x2, i64x4, 
i64x8 => i64); - } - } - #[test] fn test_custom_uniform() { use crate::distributions::uniform::{ @@ -1422,20 +434,6 @@ mod tests { } } - #[test] - fn test_uniform_from_std_range() { - let r = Uniform::from(2u32..7); - assert_eq!(r.0.low, 2); - assert_eq!(r.0.range, 5); - } - - #[test] - fn test_uniform_from_std_range_inclusive() { - let r = Uniform::from(2u32..=6); - assert_eq!(r.0.low, 2); - assert_eq!(r.0.range, 5); - } - pub(crate) fn test_samples( lb: T, ub: T, expected_single: &[T], expected_multiple: &[T], ) where Uniform: Distribution { @@ -1453,30 +451,4 @@ mod tests { } assert_eq!(&buf, expected_multiple); } - - #[test] - fn value_stability() { - // We test on a sub-set of types; possibly we should do more. - // TODO: SIMD types - - test_samples(11u8, 219, &[17, 66, 214], &[181, 93, 165]); - test_samples(11u32, 219, &[17, 66, 214], &[181, 93, 165]); - - test_samples(0f32, 1e-2f32, &[0.0003070104, 0.0026630748, 0.00979833], &[ - 0.008194133, - 0.00398172, - 0.007428536, - ]); - test_samples( - -1e10f64, - 1e10f64, - &[-4673848682.871551, 6388267422.932352, 4857075081.198343], - &[1173375212.1808167, 1917642852.109581, 2365076174.3153973], - ); - } - - #[test] - fn uniform_distributions_can_be_compared() { - assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); - } } diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs new file mode 100644 index 0000000000..8a7dac5b71 --- /dev/null +++ b/src/distributions/uniform/uniform_int.rs @@ -0,0 +1,1038 @@ +// Copyright 2018-2021 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::{SampleBorrow, SampleUniform, UniformSampler}; +use crate::distributions::utils::WideningMultiply; +#[cfg(feature = "simd_support")] +use crate::distributions::utils::{OverflowingAdd, SimdCombine, SimdSplit}; +use crate::Rng; +#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "serde1")] use serde::{Deserialize, Serialize}; + +/// The back-end implementing [`UniformSampler`] for integer types. +/// +/// Unless you are implementing [`UniformSampler`] for your own type, this type +/// should not be used directly, use [`Uniform`] instead. +/// +/// # Implementation notes +/// +/// For simplicity, we use the same generic struct `UniformInt` for all +/// integer types `X`. This gives us only one field type, `X`; to store unsigned +/// values of this size, we take use the fact that these conversions are no-ops. +/// +/// For a closed range, the number of possible numbers we should generate is +/// `range = (high - low + 1)`. To avoid bias, we must ensure that the size of +/// our sample space, `zone`, is a multiple of `range`; other values must be +/// rejected (by replacing with a new random sample). +/// +/// As a special case, we use `range = 0` to represent the full range of the +/// result type (i.e. for `new_inclusive($ty::MIN, $ty::MAX)`). +/// +/// The optimum `zone` is the largest product of `range` which fits in our +/// (unsigned) target type. We calculate this by calculating how many numbers we +/// must reject: `reject = (MAX + 1) % range = (MAX - range + 1) % range`. Any (large) +/// product of `range` will suffice, thus in `sample_single` we multiply by a +/// power of 2 via bit-shifting (faster but may cause more rejections). +/// +/// The smallest integer PRNGs generate is `u32`. 
For 8- and 16-bit outputs we +/// use `u32` for our `zone` and samples (because it's not slower and because +/// it reduces the chance of having to reject a sample). In this case we cannot +/// store `zone` in the target type since it is too large, however we know +/// `ints_to_reject < range <= $unsigned::MAX`. +/// +/// An alternative to using a modulus is widening multiply: After a widening +/// multiply by `range`, the result is in the high word. Then comparing the low +/// word against `zone` makes sure our distribution is uniform. +#[derive(Clone, Copy, Debug, PartialEq)] +#[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))] +pub struct UniformInt { + low: X, + range: X, + z: X, // either ints_to_reject or zone depending on implementation +} + +macro_rules! uniform_int_impl { + ($ty:ty, $unsigned:ident, $u_large:ident, $u_extra_large:ident) => { + impl SampleUniform for $ty { + type Sampler = UniformInt<$ty>; + } + + impl UniformSampler for UniformInt<$ty> { + // We play free and fast with unsigned vs signed here + // (when $ty is signed), but that's fine, since the + // contract of this macro is for $ty and $unsigned to be + // "bit-equal", so casting between them is a no-op. + + type X = $ty; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low < high, "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - 1) + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "Uniform::new_inclusive called with `low > high`" + ); + let unsigned_max = ::core::$u_large::MAX; + + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned; + let ints_to_reject = if range > 0 { + let range = $u_large::from(range); + (unsigned_max - range + 1) % range + } else { + 0 + }; + + UniformInt { + low, + // These are really $unsigned values, but store as $ty: + range: range as $ty, + z: ints_to_reject as $unsigned as $ty, + } + } + + #[inline] + fn sample(&self, rng: &mut R) -> Self::X { + let range = self.range as $unsigned as $u_large; + if range > 0 { + let unsigned_max = ::core::$u_large::MAX; + let zone = unsigned_max - (self.z as $unsigned as $u_large); + loop { + let v: $u_large = rng.gen(); + let (hi, lo) = v.wmul(range); + if lo <= zone { + return self.low.wrapping_add(hi as $ty); + } + } + } else { + // Sample from the entire integer range. 
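// A worked sketch of the zone arithmetic from the notes above, written out for
// a concrete `u32` range as a hypothetical free function (the
// `uniform_int_impl!` macro generates the real code). For example, with
// `range = 100`: `ints_to_reject = 2^32 mod 100 = 96`, so
// `zone = u32::MAX - 96` and exactly 96 of the 2^32 possible inputs are
// rejected per attempt.
fn sample_u32(low: u32, high_inclusive: u32, mut next_u32: impl FnMut() -> u32) -> u32 {
    let range = high_inclusive.wrapping_sub(low).wrapping_add(1);
    if range == 0 {
        return next_u32(); // full `u32` range: every sample is acceptable
    }
    let ints_to_reject = (u32::MAX - range + 1) % range; // == 2^32 mod range
    let zone = u32::MAX - ints_to_reject;
    loop {
        let wide = (next_u32() as u64) * (range as u64); // widening multiply
        let (hi, lo) = ((wide >> 32) as u32, wide as u32);
        if lo <= zone {
            return low.wrapping_add(hi); // `hi` is uniform in `0..range`
        }
    }
}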
+ rng.gen() + } + } + + #[inline] + fn sample_single(low_b: B1, high_b: B2, rng: &mut R) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low < high, "UniformSampler::sample_single: low >= high"); + Self::sample_single_inclusive(low, high - 1, rng) + } + + #[inline] + fn sample_single_inclusive( + low_b: B1, high_b: B2, rng: &mut R, + ) -> Self::X + where + B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + let zone = if ::core::$unsigned::MAX <= ::core::u16::MAX as $unsigned { + // Using a modulus is faster than the approximation for + // i8 and i16. I suppose we trade the cost of one + // modulus for near-perfect branch prediction. + let unsigned_max: $u_large = ::core::$u_large::MAX; + let ints_to_reject = (unsigned_max - range + 1) % range; + unsigned_max - ints_to_reject + } else { + // conservative but fast approximation. `- 1` is necessary to allow the + // same comparison without bias. + (range << range.leading_zeros()).wrapping_sub(1) + }; + + loop { + let v: $u_large = rng.gen(); + let (hi, lo) = v.wmul(range); + if lo <= zone { + return low.wrapping_add(hi as $ty); + } + } + } + } + + impl UniformInt<$ty> { + /// Sample single inclusive, using ONeill's method + #[inline] + pub fn sample_single_inclusive_oneill( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // we use the "Debiased Int Mult (t-opt, m-opt)" rejection sampling method + // described here https://www.pcg-random.org/posts/bounded-rands.html + // and here https://github.com/imneme/bounded-rands + + let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); + if lo < range { + let mut threshold = range.wrapping_neg(); + // this shortcut works best with large ranges + if threshold >= range { + threshold -= range; + if threshold >= range { + threshold %= range; + } + } + while lo < threshold { + let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); + hi = new_hi; + lo = new_lo; + } + } + low.wrapping_add(hi as $ty) + } + + /// Sample single inclusive, using Canon's method + #[inline] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... 
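// The bias check used here, sketched as a standalone `u32` function with
// 64-bit arithmetic (hypothetical names; the macro generates the real method).
// One 64-bit sample yields a candidate `result` plus a fractional part
// `lo_order`; a second sample can only change `result` when `lo_order` is
// within `range` of wrapping, so the extra sample is rarely drawn.
fn canon_u32(low: u32, high_inclusive: u32, mut next_u64: impl FnMut() -> u64) -> u32 {
    let range = high_inclusive.wrapping_sub(low).wrapping_add(1) as u64;
    if range == 0 {
        return next_u64() as u32; // full `u32` range
    }
    // 64x64 -> 128-bit widening multiply
    let wmul = |a: u64, b: u64| {
        let wide = (a as u128) * (b as u128);
        ((wide >> 64) as u64, wide as u64)
    };
    let (mut result, lo_order) = wmul(next_u64(), range);
    if lo_order > range.wrapping_neg() {
        // 64 more fractional bits; carry into `result` if the low parts overflow
        let (new_hi_order, _) = wmul(next_u64(), range);
        result += lo_order.checked_add(new_hi_order).is_none() as u64;
    }
    low.wrapping_add(result as u32)
}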
+ if lo_order > range.wrapping_neg() { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using Canon's method with Lemire's early-out + #[inline] + pub fn sample_inclusive_canon_lemire( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_extra_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... (since range won't be changing we can further + // improve this check with a modulo) + if lo_order < range.wrapping_neg() % range { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + low.wrapping_add(result as $ty) + } + + /// Sample single inclusive, using the Bitmask method + #[inline] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!( + low <= high, + "UniformSampler::sample_single_inclusive: low > high" + ); + let mut range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as $u_large; + if range == 0 { + // Range is MAX+1 (unrepresentable), so we need a special case + return rng.gen(); + } + + // the old impl use a mix of methods for different integer sizes, we only use + // the lz method here for a better comparison. + + let mut mask = $u_large::max_value(); + range -= 1; + mask >>= (range | 1).leading_zeros(); + loop { + let x = rng.gen::<$u_large>() & mask; + if x <= range { + return low.wrapping_add(x as $ty); + } + } + } + } + }; +} +uniform_int_impl! { i8, u8, u32, u64 } +uniform_int_impl! { i16, u16, u32, u64 } +uniform_int_impl! { i32, u32, u32, u64 } +uniform_int_impl! { i64, u64, u64, u64 } +uniform_int_impl! { i128, u128, u128, u128 } +uniform_int_impl! { u8, u8, u32, u64 } +uniform_int_impl! { u16, u16, u32, u64 } +uniform_int_impl! { u32, u32, u32, u64 } +uniform_int_impl! { u64, u64, u64, u64 } +uniform_int_impl! { u128, u128, u128, u128 } +#[cfg(any(target_pointer_width = "16", target_pointer_width = "32",))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, u64 } + uniform_int_impl! { usize, usize, usize, u64 } +} +#[cfg(not(any(target_pointer_width = "16", target_pointer_width = "32",)))] +mod isize_int_impls { + use super::*; + uniform_int_impl! { isize, usize, usize, usize } + uniform_int_impl! { usize, usize, usize, usize } +} + +#[cfg(feature = "simd_support")] +macro_rules! 
uniform_simd_int_impl { + ($ty:ident, $unsigned:ident, $u_scalar:ident) => { + // The "pick the largest zone that can fit in an `u32`" optimization + // is less useful here. Multiple lanes complicate things, we don't + // know the PRNG's minimal output size, and casting to a larger vector + // is generally a bad idea for SIMD performance. The user can still + // implement it manually. + + // TODO: look into `Uniform::::new(0u32, 100)` functionality + // perhaps `impl SampleUniform for $u_scalar`? + impl SampleUniform for $ty { + type Sampler = UniformInt<$ty>; + } + + impl UniformSampler for UniformInt<$ty> { + type X = $ty; + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - 1) + } + + #[inline] // if the range is constant, this helps LLVM to do the + // calculations at compile-time. + fn new_inclusive(low_b: B1, high_b: B2) -> Self + where B1: SampleBorrow + Sized, + B2: SampleBorrow + Sized + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), + "Uniform::new_inclusive called with `low > high`"); + let unsigned_max = ::core::$u_scalar::MAX; + + // NOTE: these may need to be replaced with explicitly + // wrapping operations if `packed_simd` changes + let range: $unsigned = ((high - low) + 1).cast(); + // `% 0` will panic at runtime. + let not_full_range = range.gt($unsigned::splat(0)); + // replacing 0 with `unsigned_max` allows a faster `select` + // with bitwise OR + let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); + // wrapping addition + let ints_to_reject = (unsigned_max - range + 1) % modulo; + // When `range` is 0, `lo` of `v.wmul(range)` will always be + // zero which means only one sample is needed. + let zone = unsigned_max - ints_to_reject; + + UniformInt { + low, + // These are really $unsigned values, but store as $ty: + range: range.cast(), + z: zone.cast(), + } + } + + fn sample(&self, rng: &mut R) -> Self::X { + let range: $unsigned = self.range.cast(); + let zone: $unsigned = self.z.cast(); + + // This might seem very slow, generating a whole new + // SIMD vector for every sample rejection. For most uses + // though, the chance of rejection is small and provides good + // general performance. With multiple lanes, that chance is + // multiplied. To mitigate this, we replace only the lanes of + // the vector which fail, iteratively reducing the chance of + // rejection. The replacement method does however add a little + // overhead. Benchmarking or calculating probabilities might + // reveal contexts where this replacement method is slower. + let mut v: $unsigned = rng.gen(); + loop { + let (hi, lo) = v.wmul(range); + let mask = lo.le(zone); + if mask.all() { + let hi: $ty = hi.cast(); + // wrapping addition + let result = self.low + hi; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. 
+ let v: $ty = v.cast(); + return range.gt($unsigned::splat(0)).select(result, v); + } + // Replace only the failing lanes + v = mask.select(v, rng.gen()); + } + } + } + + impl UniformInt<$ty> { + #[inline(always)] + fn sample_inc_setup(low_b: B1, high_b: B2) -> ($unsigned, $ty) + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let low = *low_b.borrow(); + let high = *high_b.borrow(); + assert!(low.le(high).all(), "UniformSampler::sample_single_inclusive: low > high"); + // wrapping sub and add + let range: $unsigned = ((high - low) + 1).cast(); + (range, low) + } + } + }; + + // bulk implementation + ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + $( + uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); + uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + )+ + }; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_gt8_impl { + ($ty:ident, $unsigned:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + let mut vecs_64 = range.simd_split(); + for x in &mut vecs_64 { + *x = rng.gen::().wmul((*x).cast()).0.cast(); + } + let cast_new_hi: $unsigned = vecs_64.simd_combine(); + + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// Canon's method + #[inline(always)] + pub fn sample_single_inclusive_canon_branchless( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_scalar( + _low_b: B1, _high_b: B2, _rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + Default::default() // dummy impl + } + + /// Canon's method + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// Bitmask + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / 
<$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident)),+) => {$( + uniform_simd_int_gt8_impl!{ $unsigned, $unsigned } + uniform_simd_int_gt8_impl!{ $signed, $unsigned } + )+}; +} + +#[cfg(feature = "simd_support")] +macro_rules! uniform_simd_int_le8_impl { + ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => { + impl UniformInt<$ty> { + #[inline(always)] + fn canon_successive( + range: $unsigned, + result: &mut $unsigned, + lo_order: $unsigned, + rng: &mut R + ) { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + let (new_hi_order, _) = new_bits.wmul(large_range); + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_canon_branchless( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + Self::canon_successive(range, &mut result, lo_order, rng); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_inclusive_canon_scalar( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast(); + let large_range: $u_extra_large = range.cast(); + + // let (new_hi_order, _) = new_bits.wmul(large_range); + let mut new_hi_order = <$u_extra_large>::default(); + + for i in 0..<$ty>::lanes() { + let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i)); + new_hi_order = new_hi_order.replace(i, shi); + } + + // and adjust if needed + let cast_new_hi: $unsigned = new_hi_order.cast(); + let (_, overflowed) = lo_order.overflowing_add(cast_new_hi); + result += overflowed.select($unsigned::splat(1), $unsigned::splat(0)); + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = 
rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_canon( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate a sample using a sensible integer type + let rand_bits = rng.gen::<$unsigned>(); + let (mut result, lo_order) = rand_bits.wmul(range); + + if lo_order.gt(0 - range).any() { + Self::canon_successive(range, &mut result, lo_order, rng); + } + + let cast_result: $ty = result.cast(); + let cast_rand_bits: $ty = rand_bits.cast(); + is_full_range.select(cast_rand_bits, low + cast_result) + } + + /// + #[inline(always)] + pub fn sample_single_inclusive_bitmask( + low_b: B1, high_b: B2, rng: &mut R, + ) -> $ty + where + B1: SampleBorrow<$ty> + Sized, + B2: SampleBorrow<$ty> + Sized, + { + let (mut range, low) = Self::sample_inc_setup(low_b, high_b); + let is_full_range = range.eq($unsigned::splat(0)); + + // generate bitmask + range -= 1; + let mut mask = range | 1; + + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + + const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes(); + if LANE_WIDTH >= 16 { mask |= mask >> 8; } + if LANE_WIDTH >= 32 { mask |= mask >> 16; } + if LANE_WIDTH >= 64 { mask |= mask >> 32; } + if LANE_WIDTH >= 128 { mask |= mask >> 64; } + + let mut v: $unsigned = rng.gen(); + loop { + let masked = v & mask; + let accept = masked.le(range); + if accept.all() { + let masked: $ty = masked.cast(); + // wrapping addition + let result = low + masked; + // `select` here compiles to a blend operation + // When `range.eq(0).none()` the compare and blend + // operations are avoided. + let v: $ty = v.cast(); + return is_full_range.select(v, result); + } + // Replace only the failing lanes + v = accept.select(v, rng.gen()); + } + } + } + }; + + ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$( + uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large } + uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large } + )+}; +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u128x2, i128x2), + (u128x4, i128x4), + u128 +} +// #[cfg(feature = "simd_support")] +// uniform_simd_int_impl! { +// (usizex2, isizex2), +// (usizex4, isizex4), +// (usizex8, isizex8), +// usize +// } +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u64x2, i64x2), + (u64x4, i64x4), + (u64x8, i64x8), + u64 +} +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u32x2, i32x2), + (u32x4, i32x4), + (u32x8, i32x8), + (u32x16, i32x16), + u32 +} +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u16x2, i16x2), + (u16x4, i16x4), + (u16x8, i16x8), + (u16x16, i16x16), + (u16x32, i16x32), + u16 +} +#[cfg(feature = "simd_support")] +uniform_simd_int_impl! { + (u8x2, i8x2), + (u8x4, i8x4), + (u8x8, i8x8), + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + u8 +} + +#[cfg(feature = "simd_support")] +uniform_simd_int_gt8_impl! { + (u8x16, i8x16), + (u8x32, i8x32), + (u8x64, i8x64), + + (u16x16, i16x16), + (u16x32, i16x32), + + (u32x16, i32x16) +} +#[cfg(feature = "simd_support")] +uniform_simd_int_le8_impl! 
{ + (u8x2, i8x2, i64x2, u64x2), + (u8x4, i8x4, i64x4, u64x4), + (u8x8, i8x8, i64x8, u64x8), + + (u16x2, i16x2, i64x2, u64x2), + (u16x4, i16x4, i64x4, u64x4), + (u16x8, i16x8, i64x8, u64x8), + + (u32x2, i32x2, i64x2, u64x2), + (u32x4, i32x4, i64x4, u64x4), + (u32x8, i32x8, i64x8, u64x8), + + (u64x2, i64x2, i64x2, u64x2), + (u64x4, i64x4, i64x4, u64x4), + (u64x8, i64x8, i64x8, u64x8), + + (u128x2, i128x2, i64x2, u128x2), + (u128x4, i128x4, i64x4, u128x4) + + // (usizex2, isizex2, i64x2, u64x2), + // (usizex4, isizex4, i64x4, u64x4), + // (usizex8, isizex8, i64x8, u64x8) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::distributions::uniform::tests::test_samples; + use crate::distributions::Uniform; + + #[test] + #[cfg(feature = "serde1")] + fn test_uniform_serialization() { + let unit_box: Uniform = Uniform::new(-1, 1); + let de_unit_box: Uniform = + bincode::deserialize(&bincode::serialize(&unit_box).unwrap()).unwrap(); + + assert_eq!(unit_box.0.low, de_unit_box.0.low); + assert_eq!(unit_box.0.range, de_unit_box.0.range); + assert_eq!(unit_box.0.z, de_unit_box.0.z); + } + + #[should_panic] + #[test] + fn test_uniform_bad_limits_equal_int() { + Uniform::new(10, 10); + } + + #[test] + fn test_uniform_good_limits_equal_int() { + let mut rng = crate::test::rng(804); + let dist = Uniform::new_inclusive(10, 10); + for _ in 0..20 { + assert_eq!(rng.sample(dist), 10); + } + } + + #[should_panic] + #[test] + fn test_uniform_bad_limits_flipped_int() { + Uniform::new(10, 5); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_integers() { + use core::{i128, u128}; + use core::{i16, i32, i64, i8, isize}; + use core::{u16, u32, u64, u8, usize}; + + let mut rng = crate::test::rng(251); + macro_rules! t { + ($ty:ident, $v:expr, $le:expr, $lt:expr) => {{ + for &(low, high) in $v.iter() { + let my_uniform = Uniform::new(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } + + let my_uniform = Uniform::new_inclusive(low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } + + let my_uniform = Uniform::new(&low, high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $lt(v, high)); + } + + let my_uniform = Uniform::new_inclusive(&low, &high); + for _ in 0..1000 { + let v: $ty = rng.sample(my_uniform); + assert!($le(low, v) && $le(v, high)); + } + + for _ in 0..1000 { + let v = <$ty as SampleUniform>::Sampler::sample_single(low, high, &mut rng); + assert!($le(low, v) && $lt(v, high)); + } + + for _ in 0..1000 { + let v = <$ty as SampleUniform>::Sampler::sample_single_inclusive(low, high, &mut rng); + assert!($le(low, v) && $le(v, high)); + } + } + }}; + + // scalar bulk + ($($ty:ident),*) => {{ + $(t!( + $ty, + [(0, 10), (10, 127), ($ty::MIN, $ty::MAX)], + |x, y| x <= y, + |x, y| x < y + );)* + }}; + + // simd bulk + ($($ty:ident),* => $scalar:ident) => {{ + $(t!( + $ty, + [ + ($ty::splat(0), $ty::splat(10)), + ($ty::splat(10), $ty::splat(127)), + ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), + ], + |x: $ty, y| x.le(y).all(), + |x: $ty, y| x.lt(y).all() + );)* + }}; + } + t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, i128, u128); + + #[cfg(feature = "simd_support")] + { + t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); + t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); + t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); + t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); + t!(u32x2, u32x4, u32x8, u32x16 => 
u32); + t!(i32x2, i32x4, i32x8, i32x16 => i32); + t!(u64x2, u64x4, u64x8 => u64); + t!(i64x2, i64x4, i64x8 => i64); + } + } + + #[test] + fn test_uniform_from_std_range() { + let r = Uniform::from(2u32..7); + assert_eq!(r.0.low, 2); + assert_eq!(r.0.range, 5); + } + + #[test] + fn test_uniform_from_std_range_inclusive() { + let r = Uniform::from(2u32..=6); + assert_eq!(r.0.low, 2); + assert_eq!(r.0.range, 5); + } + + #[test] + fn value_stability() { + // We test on a sub-set of types; possibly we should do more. + // TODO: SIMD types + + test_samples(11u8, 219, &[17, 66, 214], &[181, 93, 165]); + test_samples(11u32, 219, &[17, 66, 214], &[181, 93, 165]); + + test_samples(0f32, 1e-2f32, &[0.0003070104, 0.0026630748, 0.00979833], &[ + 0.008194133, + 0.00398172, + 0.007428536, + ]); + test_samples( + -1e10f64, + 1e10f64, + &[-4673848682.871551, 6388267422.932352, 4857075081.198343], + &[1173375212.1808167, 1917642852.109581, 2365076174.3153973], + ); + } + + #[test] + fn uniform_distributions_can_be_compared() { + assert_eq!(Uniform::new(1u32, 2u32), Uniform::new(1u32, 2u32)); + } +} From 1a56b3e1c6ec4857b66bcbeb0c629b451cb0e60a Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Wed, 20 Oct 2021 17:21:01 +0100 Subject: [PATCH 10/17] Add algorithm impls for repeated sampling --- benches/uniform.rs | 28 +++++- src/distributions/uniform.rs | 3 +- src/distributions/uniform/uniform_int.rs | 119 ++++++++++++++++++++--- 3 files changed, 134 insertions(+), 16 deletions(-) diff --git a/benches/uniform.rs b/benches/uniform.rs index e5b02a2912..57d62dd7c9 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -9,11 +9,27 @@ use criterion::{criterion_group, criterion_main}; use criterion::{BenchmarkId, Criterion}; #[cfg(feature = "simd_support")] use packed_simd::*; -use rand::distributions::uniform::{SampleUniform, UniformSampler}; +use rand::distributions::uniform::{SampleUniform, Uniform, UniformSampler}; use rand::prelude::*; type BenchRng = SmallRng; +macro_rules! bench_dist_int_group { + ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { + for input in $inputs { + $g.bench_with_input( + BenchmarkId::new($name, input.0), + &input.1, + |b, (low, high)| { + let mut rng = BenchRng::from_entropy(); + let dist = Uniform::new_inclusive(low, high); + b.iter(|| <$T as SampleUniform>::Sampler::$f(&dist.0, &mut rng)) + }, + ); + } + }; +} + macro_rules! bench_int_group { ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => { for input in $inputs { @@ -39,9 +55,17 @@ macro_rules! bench_int_group { macro_rules! 
bench_int { ($c:expr, $T:ty, $high:expr) => {{ - let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T))); let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))]; + let mut g = $c.benchmark_group(concat!("uniform_dist_int_", stringify!($T))); + bench_dist_int_group!("Old", $T, sample, g, inputs); + bench_dist_int_group!("Lemire", $T, sample_lemire, g, inputs); + bench_dist_int_group!("Canon", $T, sample_canon, g, inputs); + bench_dist_int_group!("Canon-Lemire", $T, sample_canon_lemire, g, inputs); + bench_dist_int_group!("Bitmask", $T, sample_bitmask, g, inputs); + drop(g); + + let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T))); bench_int_group!("Old", $T, sample_single_inclusive, g, inputs); bench_int_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs); bench_int_group!("Canon", $T, sample_single_inclusive_canon, g, inputs); diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 4b81ab9ee9..8b1beb80dc 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -178,7 +178,8 @@ pub use uniform_other::{UniformChar, UniformDuration}; feature = "serde1", serde(bound(deserialize = "X::Sampler: Deserialize<'de>")) )] -pub struct Uniform(X::Sampler); +// HACK: field is pub +pub struct Uniform(pub X::Sampler); impl Uniform { /// Create a new `Uniform` instance which samples uniformly from the half diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs index 8a7dac5b71..8eb7ac98b9 100644 --- a/src/distributions/uniform/uniform_int.rs +++ b/src/distributions/uniform/uniform_int.rs @@ -53,7 +53,8 @@ use crate::Rng; pub struct UniformInt { low: X, range: X, - z: X, // either ints_to_reject or zone depending on implementation + nrmr: X, // range.wrapping_neg() % range + z: X, // either ints_to_reject or zone depending on implementation } macro_rules! uniform_int_impl { @@ -110,6 +111,7 @@ macro_rules! uniform_int_impl { low, // These are really $unsigned values, but store as $ty: range: range as $ty, + nrmr: (range.wrapping_neg() % range) as $ty, z: ints_to_reject as $unsigned as $ty, } } @@ -117,19 +119,18 @@ macro_rules! uniform_int_impl { #[inline] fn sample(&self, rng: &mut R) -> Self::X { let range = self.range as $unsigned as $u_large; - if range > 0 { - let unsigned_max = ::core::$u_large::MAX; - let zone = unsigned_max - (self.z as $unsigned as $u_large); - loop { - let v: $u_large = rng.gen(); - let (hi, lo) = v.wmul(range); - if lo <= zone { - return self.low.wrapping_add(hi as $ty); - } + if range == 0 { + return rng.gen(); + } + + let unsigned_max = ::core::$u_large::MAX; + let zone = unsigned_max - (self.z as $unsigned as $u_large); + loop { + let v: $u_large = rng.gen(); + let (hi, lo) = v.wmul(range); + if lo <= zone { + return self.low.wrapping_add(hi as $ty); } - } else { - // Sample from the entire integer range. - rng.gen() } } @@ -189,6 +190,97 @@ macro_rules! 
uniform_int_impl { } impl UniformInt<$ty> { + /// Sample, Lemire's method + #[inline] + pub fn sample_lemire(&self, rng: &mut R) -> $ty { + let range = self.range as $unsigned as $u_large; + if range == 0 { + return rng.gen(); + } + + let (mut hi, mut lo) = rng.gen::<$u_large>().wmul(range); + if lo < range { + while lo < (self.nrmr as $u_large) { + let (new_hi, new_lo) = rng.gen::<$u_large>().wmul(range); + hi = new_hi; + lo = new_lo; + } + } + self.low.wrapping_add(hi as $ty) + } + + /// Sample, Canon's method + #[inline] + pub fn sample_canon(&self, rng: &mut R) -> $ty { + let range = self.range as $unsigned as $u_extra_large; + if range == 0 { + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... + if lo_order > range.wrapping_neg() { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + self.low.wrapping_add(result as $ty) + } + + /// Sample, Canon's method with Lemire's early-out + #[inline] + pub fn sample_canon_lemire(&self, rng: &mut R) -> $ty { + let range = self.range as $unsigned as $u_extra_large; + if range == 0 { + return rng.gen(); + } + + // generate a sample using a sensible integer type + let (mut result, lo_order) = rng.gen::<$u_extra_large>().wmul(range); + + // if the sample is biased... + if lo_order < (self.nrmr as $u_extra_large) { + // ...generate a new sample with 64 more bits, enough that bias is undetectable + let (new_hi_order, _) = + (rng.gen::<$u_extra_large>()).wmul(range as $u_extra_large); + // and adjust if needed + result += lo_order + .checked_add(new_hi_order as $u_extra_large) + .is_none() as $u_extra_large; + } + + self.low.wrapping_add(result as $ty) + } + + /// Sample, Bitmask method + #[inline] + pub fn sample_bitmask(&self, rng: &mut R) -> $ty { + let mut range = self.range as $unsigned as $u_large; + if range == 0 { + return rng.gen(); + } + + // the old impl use a mix of methods for different integer sizes, we only use + // the lz method here for a better comparison. + + let mut mask = $u_large::max_value(); + range -= 1; + mask >>= (range | 1).leading_zeros(); + loop { + let x = rng.gen::<$u_large>() & mask; + if x <= range { + return self.low.wrapping_add(x as $ty); + } + } + } + /// Sample single inclusive, using ONeill's method #[inline] pub fn sample_single_inclusive_oneill( @@ -430,6 +522,7 @@ macro_rules! uniform_simd_int_impl { low, // These are really $unsigned values, but store as $ty: range: range.cast(), + nrmr: ((0 - range) % range).cast(), z: zone.cast(), } } From faa141cb53e0c89529dc3ef36abaa058917a8abd Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Thu, 21 Oct 2021 10:38:05 +0100 Subject: [PATCH 11/17] Extra SIMD i8 benches --- benches/uniform.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benches/uniform.rs b/benches/uniform.rs index 57d62dd7c9..393c691377 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -133,8 +133,12 @@ macro_rules! 
bench_simd { fn uniform_simd(c: &mut Criterion) { // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size // the exact values were determined with an exhaustive search + bench_simd!(c, i8x2, (i8::MIN, 116)); + bench_simd!(c, i8x4, (i8::MIN, 116)); + bench_simd!(c, i8x8, (i8::MIN, 116)); bench_simd!(c, i8x16, (i8::MIN, 116)); bench_simd!(c, i8x32, (i8::MIN, 116)); + bench_simd!(c, i8x64, (i8::MIN, 116)); bench_simd!(c, i16x8, (i16::MIN, 32407)); bench_simd!(c, i16x16, (i16::MIN, 32407)); bench_simd!(c, i32x4, (i32::MIN, 1)); From b7f9c49d4d4129241cde3789dfefd2924edd5a3a Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Fri, 22 Oct 2021 11:33:58 +0100 Subject: [PATCH 12/17] Add UniformInt::sample_canon_64 (15% faster) --- benches/uniform.rs | 4 ++++ src/distributions/uniform/uniform_int.rs | 25 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/benches/uniform.rs b/benches/uniform.rs index 393c691377..fa2350b4f4 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -82,6 +82,10 @@ fn uniform_int(c: &mut Criterion) { bench_int!(c, i32, (i32::MIN, 1)); bench_int!(c, i64, (i64::MIN, 1)); bench_int!(c, i128, (i128::MIN, 1)); + + let inputs = &[("high_reject", (i128::MIN, 1)), ("low_reject", (-1, 2))]; + let mut g = c.benchmark_group(concat!("uniform_dist_int_", stringify!(i128))); + bench_dist_int_group!("Canon64", i128, sample_canon_64, g, inputs); } #[cfg(feature = "simd_support")] diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs index 8eb7ac98b9..b9381986fc 100644 --- a/src/distributions/uniform/uniform_int.rs +++ b/src/distributions/uniform/uniform_int.rs @@ -462,6 +462,31 @@ mod isize_int_impls { uniform_int_impl! { usize, usize, usize, usize } } +impl UniformInt { + /// Sample, Canon's method variant + #[inline] + pub fn sample_canon_64(&self, rng: &mut R) -> i128 { + let range = self.range as i128 as u128; + if range == 0 { + return rng.gen(); + } + + // Sample multiplied by 2.pow(-128) makes lo1 fractional (>>128): + let (mut result, lo1) = rng.gen::().wmul(range); + + if lo1 > range.wrapping_neg() { + // Generate more bits. Sample is multiplied by 2.pow(-192), so + // hi2 is multiplied by 2.pow(-64): + let (hi2, lo2) = (rng.gen::() as u128).wmul(range); + debug_assert_eq!(hi2 >> 64, 0u128); + let is_overflow = lo1.checked_add((hi2 << 64) | (lo2 >> 64)).is_none(); + result += is_overflow as u128; + } + + self.low.wrapping_add(result as i128) + } +} + #[cfg(feature = "simd_support")] macro_rules! uniform_simd_int_impl { ($ty:ident, $unsigned:ident, $u_scalar:ident) => { From 5326db3fc5182a439c3a486f9ae9d1f69f310789 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Sun, 24 Oct 2021 08:47:56 +0100 Subject: [PATCH 13/17] Additional Canon64 variants --- benches/uniform.rs | 5 +--- src/distributions/uniform/uniform_int.rs | 37 +++++++++++++++++++++++- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/benches/uniform.rs b/benches/uniform.rs index fa2350b4f4..aa92035e79 100644 --- a/benches/uniform.rs +++ b/benches/uniform.rs @@ -61,6 +61,7 @@ macro_rules! 
bench_int {
         bench_dist_int_group!("Old", $T, sample, g, inputs);
         bench_dist_int_group!("Lemire", $T, sample_lemire, g, inputs);
         bench_dist_int_group!("Canon", $T, sample_canon, g, inputs);
+        bench_dist_int_group!("Canon64", $T, sample_canon_64, g, inputs);
         bench_dist_int_group!("Canon-Lemire", $T, sample_canon_lemire, g, inputs);
         bench_dist_int_group!("Bitmask", $T, sample_bitmask, g, inputs);
         drop(g);
@@ -82,10 +83,6 @@ fn uniform_int(c: &mut Criterion) {
     bench_int!(c, i32, (i32::MIN, 1));
     bench_int!(c, i64, (i64::MIN, 1));
     bench_int!(c, i128, (i128::MIN, 1));
-
-    let inputs = &[("high_reject", (i128::MIN, 1)), ("low_reject", (-1, 2))];
-    let mut g = c.benchmark_group(concat!("uniform_dist_int_", stringify!(i128)));
-    bench_dist_int_group!("Canon64", i128, sample_canon_64, g, inputs);
 }
 
 #[cfg(feature = "simd_support")]
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index b9381986fc..9c65c582d0 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -462,11 +462,46 @@ mod isize_int_impls {
     uniform_int_impl! { usize, usize, usize, usize }
 }
 
+macro_rules! uniform_int_64_impl {
+    ($ty:ty, $unsigned:ident) => {
+        impl UniformInt<$ty> {
+            /// Sample, Canon's method variant
+            #[inline]
+            pub fn sample_canon_64<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let range = self.range as $unsigned as u64;
+                if range == 0 {
+                    return rng.gen();
+                }
+
+                let (result, _lo1) = rng.gen::<u64>().wmul(range);
+                // bias is at most 1 in 2.pow(56) for i8, 1 in 2.pow(48) for i16
+                self.low.wrapping_add(result as $ty)
+            }
+        }
+    }
+}
+uniform_int_64_impl!(i8, u8);
+uniform_int_64_impl!(i16, u16);
+
+macro_rules! uniform_int_64void_impl {
+    ($ty:ty) => {
+        impl UniformInt<$ty> {
+            /// Sample, Canon's method variant
+            #[inline]
+            pub fn sample_canon_64<R: Rng + ?Sized>(&self, _rng: &mut R) -> $ty {
+                Default::default() // not used
+            }
+        }
+    }
+}
+uniform_int_64void_impl!(i32);
+uniform_int_64void_impl!(i64);
+
 impl UniformInt<i128> {
     /// Sample, Canon's method variant
     #[inline]
     pub fn sample_canon_64<R: Rng + ?Sized>(&self, rng: &mut R) -> i128 {
-        let range = self.range as i128 as u128;
+        let range = self.range as u128;
         if range == 0 {
             return rng.gen();
         }

From a97199b0a291cf5f42561e3040560e173eb07b9b Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Mon, 15 Nov 2021 15:08:59 +0000
Subject: [PATCH 14/17] Add sample_single_inclusive_canon_64

This beats other uniform_int_i128 results
---
 benches/uniform.rs                       |  1 +
 src/distributions/uniform/uniform_int.rs | 81 ++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/benches/uniform.rs b/benches/uniform.rs
index aa92035e79..cb743b4f78 100644
--- a/benches/uniform.rs
+++ b/benches/uniform.rs
@@ -70,6 +70,7 @@ macro_rules! bench_int {
         bench_int_group!("Old", $T, sample_single_inclusive, g, inputs);
         bench_int_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs);
         bench_int_group!("Canon", $T, sample_single_inclusive_canon, g, inputs);
+        bench_int_group!("Canon64", $T, sample_single_inclusive_canon_64, g, inputs);
         bench_int_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs);
         bench_int_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs);
     }};
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index 9c65c582d0..b1bd86927b 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -466,6 +468,8 @@ macro_rules! uniform_int_64_impl {
     ($ty:ty, $unsigned:ident) => {
         impl UniformInt<$ty> {
             /// Sample, Canon's method variant
+            ///
+            /// Variant: potential increase to bias (uses a single `u64` sample).
             #[inline]
             pub fn sample_canon_64<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
                 let range = self.range as $unsigned as u64;
@@ -477,6 +479,35 @@ macro_rules! uniform_int_64_impl {
                 // bias is at most 1 in 2.pow(56) for i8, 1 in 2.pow(48) for i16
                 self.low.wrapping_add(result as $ty)
             }
+
+            /// Sample single inclusive, using Canon's method variant
+            ///
+            /// Variant: potential increase to bias (uses a single `u64` sample).
+            #[inline]
+            pub fn sample_single_inclusive_canon_64<R: Rng + ?Sized, B1, B2>(
+                low_b: B1, high_b: B2, rng: &mut R,
+            ) -> $ty
+            where
+                B1: SampleBorrow<$ty> + Sized,
+                B2: SampleBorrow<$ty> + Sized,
+            {
+                let low = *low_b.borrow();
+                let high = *high_b.borrow();
+                assert!(
+                    low <= high,
+                    "UniformSampler::sample_single_inclusive: low > high"
+                );
+                let range = high.wrapping_sub(low).wrapping_add(1) as $unsigned as u64;
+                if range == 0 {
+                    // Range is MAX+1 (unrepresentable), so we need a special case
+                    return rng.gen();
+                }
+
+                // generate a sample using a sensible integer type
+                let (result, _lo1) = rng.gen::<u64>().wmul(range);
+                // bias is at most 1 in 2.pow(56) for i8, 1 in 2.pow(48) for i16
+                low.wrapping_add(result as $ty)
+            }
         }
     }
 }
@@ -491,6 +522,18 @@ macro_rules! uniform_int_64void_impl {
             pub fn sample_canon_64<R: Rng + ?Sized>(&self, _rng: &mut R) -> $ty {
                 Default::default() // not used
             }
+
+            /// Sample single inclusive, using Canon's method variant
+            #[inline]
+            pub fn sample_single_inclusive_canon_64<R: Rng + ?Sized, B1, B2>(
+                _low_b: B1, _high_b: B2, _rng: &mut R,
+            ) -> $ty
+            where
+                B1: SampleBorrow<$ty> + Sized,
+                B2: SampleBorrow<$ty> + Sized,
+            {
+                Default::default() // not used
+            }
         }
     }
 }
@@ -520,6 +563,44 @@ impl UniformInt<i128> {
         self.low.wrapping_add(result as i128)
     }
+
+    /// Sample single inclusive, using Canon's method variant
+    ///
+    /// Variant: potential increase to bias (uses a single `u64` sample).
+    #[inline]
+    pub fn sample_single_inclusive_canon_64<R: Rng + ?Sized, B1, B2>(
+        low_b: B1, high_b: B2, rng: &mut R,
+    ) -> i128
+    where
+        B1: SampleBorrow<i128> + Sized,
+        B2: SampleBorrow<i128> + Sized,
+    {
+        let low = *low_b.borrow();
+        let high = *high_b.borrow();
+        assert!(
+            low <= high,
+            "UniformSampler::sample_single_inclusive: low > high"
+        );
+        let range = high.wrapping_sub(low).wrapping_add(1) as u128;
+        if range == 0 {
+            // Range is MAX+1 (unrepresentable), so we need a special case
+            return rng.gen();
+        }
+
+        // generate a sample using a sensible integer type
+        let (mut result, lo1) = rng.gen::<u128>().wmul(range);
+
+        if lo1 > range.wrapping_neg() {
+            // Generate more bits. Sample is multiplied by 2.pow(-192), so
+            // hi2 is multiplied by 2.pow(-64):
+            let (hi2, lo2) = (rng.gen::<u64>() as u128).wmul(range);
+            debug_assert_eq!(hi2 >> 64, 0u128);
+            let is_overflow = lo1.checked_add((hi2 << 64) | (lo2 >> 64)).is_none();
+            result += is_overflow as u128;
+        }
+
+        low.wrapping_add(result as i128)
+    }
 }
 
 #[cfg(feature = "simd_support")]

From 9d0c88a2979974bd3088368660fa0f832ac5d037 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Mon, 15 Nov 2021 15:16:30 +0000
Subject: [PATCH 15/17] dedup SIMD sample_single_inclusive_bitmask

---
 src/distributions/uniform/uniform_int.rs | 135 ++++++++--------------
 1 file changed, 45 insertions(+), 90 deletions(-)

diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index b1bd86927b..da1a5b3e93 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -715,6 +715,51 @@ macro_rules! uniform_simd_int_impl {
                 let range: $unsigned = ((high - low) + 1).cast();
                 (range, low)
             }
+
+            ///
+            #[inline(always)]
+            pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
+                low_b: B1, high_b: B2, rng: &mut R,
+            ) -> $ty
+            where
+                B1: SampleBorrow<$ty> + Sized,
+                B2: SampleBorrow<$ty> + Sized,
+            {
+                let (mut range, low) = Self::sample_inc_setup(low_b, high_b);
+                let is_full_range = range.eq($unsigned::splat(0));
+
+                // generate bitmask
+                range -= 1;
+                let mut mask = range | 1;
+
+                mask |= mask >> 1;
+                mask |= mask >> 2;
+                mask |= mask >> 4;
+
+                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
+                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
+                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
+                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
+                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
+
+                let mut v: $unsigned = rng.gen();
+                loop {
+                    let masked = v & mask;
+                    let accept = masked.le(range);
+                    if accept.all() {
+                        let masked: $ty = masked.cast();
+                        // wrapping addition
+                        let result = low + masked;
+                        // `select` here compiles to a blend operation
+                        // When `range.eq(0).none()` the compare and blend
+                        // operations are avoided.
+                        let v: $ty = v.cast();
+                        return is_full_range.select(v, result);
+                    }
+                    // Replace only the failing lanes
+                    v = accept.select(v, rng.gen());
+                }
+            }
         }
     };
 
@@ -806,51 +851,6 @@ macro_rules! uniform_simd_int_gt8_impl {
                 let cast_rand_bits: $ty = rand_bits.cast();
                 is_full_range.select(cast_rand_bits, low + cast_result)
             }
-
-            /// Bitmask
-            #[inline(always)]
-            pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
-                low_b: B1, high_b: B2, rng: &mut R,
-            ) -> $ty
-            where
-                B1: SampleBorrow<$ty> + Sized,
-                B2: SampleBorrow<$ty> + Sized,
-            {
-                let (mut range, low) = Self::sample_inc_setup(low_b, high_b);
-                let is_full_range = range.eq($unsigned::splat(0));
-
-                // generate bitmask
-                range -= 1;
-                let mut mask = range | 1;
-
-                mask |= mask >> 1;
-                mask |= mask >> 2;
-                mask |= mask >> 4;
-
-                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
-                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
-                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
-                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
-                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
-
-                let mut v: $unsigned = rng.gen();
-                loop {
-                    let masked = v & mask;
-                    let accept = masked.le(range);
-                    if accept.all() {
-                        let masked: $ty = masked.cast();
-                        // wrapping addition
-                        let result = low + masked;
-                        // `select` here compiles to a blend operation
-                        // When `range.eq(0).none()` the compare and blend
-                        // operations are avoided.
-                        let v: $ty = v.cast();
-                        return is_full_range.select(v, result);
-                    }
-                    // Replace only the failing lanes
-                    v = accept.select(v, rng.gen());
-                }
-            }
         }
     };
 
@@ -966,51 +966,6 @@ macro_rules! uniform_simd_int_le8_impl {
                 let cast_rand_bits: $ty = rand_bits.cast();
                 is_full_range.select(cast_rand_bits, low + cast_result)
             }
-
-            ///
-            #[inline(always)]
-            pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
-                low_b: B1, high_b: B2, rng: &mut R,
-            ) -> $ty
-            where
-                B1: SampleBorrow<$ty> + Sized,
-                B2: SampleBorrow<$ty> + Sized,
-            {
-                let (mut range, low) = Self::sample_inc_setup(low_b, high_b);
-                let is_full_range = range.eq($unsigned::splat(0));
-
-                // generate bitmask
-                range -= 1;
-                let mut mask = range | 1;
-
-                mask |= mask >> 1;
-                mask |= mask >> 2;
-                mask |= mask >> 4;
-
-                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
-                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
-                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
-                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
-                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
-
-                let mut v: $unsigned = rng.gen();
-                loop {
-                    let masked = v & mask;
-                    let accept = masked.le(range);
-                    if accept.all() {
-                        let masked: $ty = masked.cast();
-                        // wrapping addition
-                        let result = low + masked;
-                        // `select` here compiles to a blend operation
-                        // When `range.eq(0).none()` the compare and blend
-                        // operations are avoided.
-                        let v: $ty = v.cast();
-                        return is_full_range.select(v, result);
-                    }
-                    // Replace only the failing lanes
-                    v = accept.select(v, rng.gen());
-                }
-            }
         }
     };

From b912a09c06eb6f1e4befc1a1fc0d30c92e4d2072 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Tue, 16 Nov 2021 09:30:25 +0000
Subject: [PATCH 16/17] temp

---
 benches/uniform.rs                       |  45 +++--
 src/distributions/uniform/uniform_int.rs | 236 ++++++++++++++---------
 2 files changed, 177 insertions(+), 104 deletions(-)

diff --git a/benches/uniform.rs b/benches/uniform.rs
index cb743b4f78..cdd8376482 100644
--- a/benches/uniform.rs
+++ b/benches/uniform.rs
@@ -86,6 +86,24 @@ fn uniform_int(c: &mut Criterion) {
     bench_int!(c, i128, (i128::MIN, 1));
 }
 
+#[cfg(feature = "simd_support")]
+macro_rules! bench_dist_simd_group {
+    ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
+        for input in $inputs {
+            $g.bench_with_input(
+                BenchmarkId::new($name, input.0),
+                &input.1,
+                |b, (low, high)| {
+                    let mut rng = BenchRng::from_entropy();
+                    let (low, high) = (<$T>::splat(*low), <$T>::splat(*high));
+                    let dist = Uniform::new_inclusive(low, high);
+                    b.iter(|| <$T as SampleUniform>::Sampler::$f(&dist.0, &mut rng))
+                },
+            );
+        }
+    };
+}
+
 #[cfg(feature = "simd_support")]
 macro_rules! bench_simd_group {
     ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
@@ -114,20 +132,21 @@ macro_rules! bench_simd_group {
 
 #[cfg(feature = "simd_support")]
 macro_rules! bench_simd {
     ($c:expr, $T:ty, $high:expr/*, $incr:expr*/) => {{
-        let mut g = $c.benchmark_group(concat!("uniform_simd_", stringify!($T)));
         let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))];
 
+        let mut g = $c.benchmark_group(concat!("uniform_dist_simd_", stringify!($T)));
+        bench_dist_simd_group!("Old", $T, sample, g, inputs);
+        bench_dist_simd_group!("Canon", $T, sample_canon, g, inputs);
+        bench_dist_simd_group!("Canon-Lemire", $T, sample_canon_lemire, g, inputs);
+        bench_dist_simd_group!("Bitmask", $T, sample_bitmask, g, inputs);
+        drop(g);
+
+        let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T)));
         bench_simd_group!("Old", $T, sample_single_inclusive, g, inputs);
         bench_simd_group!("Canon", $T, sample_single_inclusive_canon, g, inputs);
-        bench_simd_group!(
-            "Canon-branchless",
-            $T,
-            sample_single_inclusive_canon_branchless,
-            g,
-            inputs
-        );
-        bench_simd_group!("Canon-scalar", $T, sample_inclusive_canon_scalar, g, inputs);
+        bench_simd_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs);
         bench_simd_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs);
+
     }};
 }
@@ -138,11 +157,11 @@ fn uniform_simd(c: &mut Criterion) {
     bench_simd!(c, i8x2, (i8::MIN, 116));
     bench_simd!(c, i8x4, (i8::MIN, 116));
     bench_simd!(c, i8x8, (i8::MIN, 116));
-    bench_simd!(c, i8x16, (i8::MIN, 116));
-    bench_simd!(c, i8x32, (i8::MIN, 116));
-    bench_simd!(c, i8x64, (i8::MIN, 116));
+// bench_simd!(c, i8x16, (i8::MIN, 116));
+// bench_simd!(c, i8x32, (i8::MIN, 116));
+// bench_simd!(c, i8x64, (i8::MIN, 116));
     bench_simd!(c, i16x8, (i16::MIN, 32407));
-    bench_simd!(c, i16x16, (i16::MIN, 32407));
+// bench_simd!(c, i16x16, (i16::MIN, 32407));
     bench_simd!(c, i32x4, (i32::MIN, 1));
     bench_simd!(c, i32x8, (i32::MIN, 1));
     bench_simd!(c, i64x2, (i64::MIN, 1));
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
index da1a5b3e93..24c84f38d9 100644
--- a/src/distributions/uniform/uniform_int.rs
+++ b/src/distributions/uniform/uniform_int.rs
@@ -702,6 +702,48 @@ macro_rules! uniform_simd_int_impl {
         }
 
         impl UniformInt<$ty> {
+            /// Sample, Bitmask method
+            #[inline]
+            pub fn sample_bitmask<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let mut range: $unsigned = self.range.cast();
+                let is_full_range = range.eq($unsigned::splat(0));
+
+                // the old impl uses a mix of methods for different integer sizes, we only use
+                // the lz method here for a better comparison.
+
+                // generate bitmask
+                range -= 1;
+                let mut mask = range | 1;
+
+                mask |= mask >> 1;
+                mask |= mask >> 2;
+                mask |= mask >> 4;
+
+                const LANE_WIDTH: usize = std::mem::size_of::<$ty>() * 8 / <$ty>::lanes();
+                if LANE_WIDTH >= 16 { mask |= mask >> 8; }
+                if LANE_WIDTH >= 32 { mask |= mask >> 16; }
+                if LANE_WIDTH >= 64 { mask |= mask >> 32; }
+                if LANE_WIDTH >= 128 { mask |= mask >> 64; }
+
+                let mut v: $unsigned = rng.gen();
+                loop {
+                    let masked = v & mask;
+                    let accept = masked.le(range);
+                    if accept.all() {
+                        let masked: $ty = masked.cast();
+                        // wrapping addition
+                        let result = self.low + masked;
+                        // `select` here compiles to a blend operation
+                        // When `range.eq(0).none()` the compare and blend
+                        // operations are avoided.
+                        let v: $ty = v.cast();
+                        return is_full_range.select(v, result);
+                    }
+                    // Replace only the failing lanes
+                    v = accept.select(v, rng.gen());
+                }
+            }
+
             #[inline(always)]
             fn sample_inc_setup<B1, B2>(low_b: B1, high_b: B2) -> ($unsigned, $ty)
             where
@@ -716,8 +758,8 @@ macro_rules! uniform_simd_int_impl {
                 (range, low)
             }
 
-            ///
-            #[inline(always)]
+            /// Sample single inclusive, using the Bitmask method
+            #[inline]
             pub fn sample_single_inclusive_bitmask<R: Rng + ?Sized, B1, B2>(
                 low_b: B1, high_b: B2, rng: &mut R,
             ) -> $ty
@@ -771,7 +813,7 @@ macro_rules! uniform_simd_int_impl {
         )+
     };
 }
-
+/*
 #[cfg(feature = "simd_support")]
 macro_rules! uniform_simd_int_gt8_impl {
     ($ty:ident, $unsigned:ident) => {
@@ -792,7 +834,7 @@ macro_rules! uniform_simd_int_gt8_impl {
                 let (_, overflowed) = lo_order.overflowing_add(cast_new_hi);
                 *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0));
             }
-
+/*
             /// Canon's method
             #[inline(always)]
             pub fn sample_single_inclusive_canon_branchless<R: Rng + ?Sized, B1, B2>(
@@ -851,6 +893,7 @@ macro_rules! uniform_simd_int_gt8_impl {
                 let cast_rand_bits: $ty = rand_bits.cast();
                 is_full_range.select(cast_rand_bits, low + cast_result)
             }
+*/
         }
     };
 
@@ -859,54 +902,77 @@ macro_rules! uniform_simd_int_gt8_impl {
         uniform_simd_int_gt8_impl!{ $signed, $unsigned }
     )+};
 }
+*/
+// These are the naive ports of the above algorithms to SIMD.
+// Caveat: supports only up to x8, and at x8 it relies on 512-bit SIMD.
+// Caveat: always generates u64 samples which will often be missing.
 
 #[cfg(feature = "simd_support")]
 macro_rules! uniform_simd_int_le8_impl {
-    ($ty:ident, $unsigned:ident, $u64xN_type:ident, $u_extra_large:ident) => {
+    ($ty:ident, $unsigned:ident, $u_extra_large:ident) => {
         impl UniformInt<$ty> {
             #[inline(always)]
             fn canon_successive<R: Rng + ?Sized>(
-                range: $unsigned,
-                result: &mut $unsigned,
-                lo_order: $unsigned,
+                range: $u_extra_large,
+                result: &mut $u_extra_large,
+                lo_order: $u_extra_large,
                 rng: &mut R
             ) {
                 // ...generate a new sample with 64 more bits, enough that bias is undetectable
-                let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast();
-                let large_range: $u_extra_large = range.cast();
-                let (new_hi_order, _) = new_bits.wmul(large_range);
+                let new_bits: $u_extra_large = rng.gen::<$u_extra_large>();
+                let (new_hi_order, _) = new_bits.wmul(range);
                 // and adjust if needed
-                let cast_new_hi: $unsigned = new_hi_order.cast();
-                let (_, overflowed) = lo_order.overflowing_add(cast_new_hi);
-                *result += overflowed.select($unsigned::splat(1), $unsigned::splat(0));
+                let (_, overflowed) = lo_order.overflowing_add(new_hi_order);
+                *result += overflowed.select($u_extra_large::splat(1), $u_extra_large::splat(0));
             }
 
-            ///
-            #[inline(always)]
-            pub fn sample_single_inclusive_canon_branchless<R: Rng + ?Sized, B1, B2>(
-                low_b: B1, high_b: B2, rng: &mut R,
-            ) -> $ty
-            where
-                B1: SampleBorrow<$ty> + Sized,
-                B2: SampleBorrow<$ty> + Sized,
-            {
-                let (range, low) = Self::sample_inc_setup(low_b, high_b);
+            /// Sample, Canon's method
+            #[inline]
+            pub fn sample_canon<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let range: $unsigned = self.range.cast();
                 let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
 
                 // generate a sample using a sensible integer type
-                let rand_bits = rng.gen::<$unsigned>();
+                let rand_bits = rng.gen::<$u_extra_large>();
                 let (mut result, lo_order) = rand_bits.wmul(range);
 
-                Self::canon_successive(range, &mut result, lo_order, rng);
+                if lo_order.gt(0 - range).any() {
+                    Self::canon_successive(range, &mut result, lo_order, rng);
+                }
 
-                let cast_result: $ty = result.cast();
-                let cast_rand_bits: $ty = rand_bits.cast();
-                is_full_range.select(cast_rand_bits, low + cast_result)
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, self.low + result)
             }
 
-            ///
-            #[inline(always)]
-            pub fn sample_inclusive_canon_scalar<R: Rng + ?Sized, B1, B2>(
+            /// Sample, Canon's method with Lemire's early-out
+            #[inline]
+            pub fn sample_canon_lemire<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let range: $unsigned = self.range.cast();
+                let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
+
+                // generate a sample using a sensible integer type
+                let rand_bits = rng.gen::<$u_extra_large>();
+                let (mut result, lo_order) = rand_bits.wmul(range);
+
+                let nrmr: $unsigned = self.nrmr.cast();
+                let nrmr: $u_extra_large = nrmr.cast();
+                if lo_order.lt(nrmr).any() {
+                    Self::canon_successive(range, &mut result, lo_order, rng);
+                }
+
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, self.low + result)
+            }
+
+            /// Sample single inclusive, using Canon's method
+            #[inline]
+            pub fn sample_single_inclusive_canon<R: Rng + ?Sized, B1, B2>(
                 low_b: B1, high_b: B2, rng: &mut R,
             ) -> $ty
             where
                 B1: SampleBorrow<$ty> + Sized,
                 B2: SampleBorrow<$ty> + Sized,
             {
                 let (range, low) = Self::sample_inc_setup(low_b, high_b);
                 let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
 
                 // generate a sample using a sensible integer type
-                let rand_bits = rng.gen::<$unsigned>();
+                let rand_bits = rng.gen::<$u_extra_large>();
                 let (mut result, lo_order) = rand_bits.wmul(range);
 
-                // ...generate a new sample with 64 more bits, enough that bias is undetectable
-                let new_bits: $u_extra_large = rng.gen::<$u64xN_type>().cast();
-                let large_range: $u_extra_large = range.cast();
-
-                // let (new_hi_order, _) = new_bits.wmul(large_range);
-                let mut new_hi_order = <$u_extra_large>::default();
-
-                for i in 0..<$ty>::lanes() {
-                    let (shi, _slo) = new_bits.extract(i).wmul(large_range.extract(i));
-                    new_hi_order = new_hi_order.replace(i, shi);
+                if lo_order.gt(0 - range).any() {
+                    Self::canon_successive(range, &mut result, lo_order, rng);
                 }
 
-                // and adjust if needed
-                let cast_new_hi: $unsigned = new_hi_order.cast();
-                let (_, overflowed) = lo_order.overflowing_add(cast_new_hi);
-                result += overflowed.select($unsigned::splat(1), $unsigned::splat(0));
-
-                let cast_result: $ty = result.cast();
-                let cast_rand_bits: $ty = rand_bits.cast();
-                is_full_range.select(cast_rand_bits, low + cast_result)
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, low + result)
             }
 
-            ///
-            #[inline(always)]
-            pub fn sample_single_inclusive_canon<R: Rng + ?Sized, B1, B2>(
+            /// Sample single inclusive, using Canon's method with Lemire's early-out
+            #[inline]
+            pub fn sample_inclusive_canon_lemire<R: Rng + ?Sized, B1, B2>(
                 low_b: B1, high_b: B2, rng: &mut R,
             ) -> $ty
             where
                 B1: SampleBorrow<$ty> + Sized,
                 B2: SampleBorrow<$ty> + Sized,
             {
                 let (range, low) = Self::sample_inc_setup(low_b, high_b);
                 let is_full_range = range.eq($unsigned::splat(0));
+                let range: $u_extra_large = range.cast();
 
                 // generate a sample using a sensible integer type
-                let rand_bits = rng.gen::<$unsigned>();
+                let rand_bits = rng.gen::<$u_extra_large>();
                 let (mut result, lo_order) = rand_bits.wmul(range);
 
-                if lo_order.gt(0 - range).any() {
+                let nrmr = ((0 - range) % range);
+                if lo_order.lt(nrmr).any() {
                     Self::canon_successive(range, &mut result, lo_order, rng);
                 }
 
-                let cast_result: $ty = result.cast();
-                let cast_rand_bits: $ty = rand_bits.cast();
-                is_full_range.select(cast_rand_bits, low + cast_result)
+                // truncate and return the result:
+                let result: $ty = result.cast();
+                let rand_bits: $ty = rand_bits.cast();
+                is_full_range.select(rand_bits, low + result)
+            }
         }
     };
 
-    ($(($unsigned:ident, $signed:ident, $u64xN_type:ident, $u_extra_large:ident)),+) => {$(
-        uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u64xN_type, $u_extra_large }
-        uniform_simd_int_le8_impl!{ $signed, $unsigned, $u64xN_type, $u_extra_large }
+    ($(($unsigned:ident, $signed:ident, $u_extra_large:ident),)+) => {$(
+        uniform_simd_int_le8_impl!{ $unsigned, $unsigned, $u_extra_large }
+        uniform_simd_int_le8_impl!{ $signed, $unsigned, $u_extra_large }
     )+};
 }
@@ -1023,41 +1081,37 @@ uniform_simd_int_impl! {
     u8
 }
 
-#[cfg(feature = "simd_support")]
-uniform_simd_int_gt8_impl! {
-    (u8x16, i8x16),
-    (u8x32, i8x32),
-    (u8x64, i8x64),
-
-    (u16x16, i16x16),
-    (u16x32, i16x32),
-
-    (u32x16, i32x16)
-}
+// #[cfg(feature = "simd_support")]
+// uniform_simd_int_gt8_impl! {
+//     (u8x16, i8x16),
+//     (u8x32, i8x32),
+//     (u8x64, i8x64),
+//
+//     (u16x16, i16x16),
+//     (u16x32, i16x32),
+//
+//     (u32x16, i32x16)
+// }
 
 #[cfg(feature = "simd_support")]
 uniform_simd_int_le8_impl! {
-    (u8x2, i8x2, i64x2, u64x2),
-    (u8x4, i8x4, i64x4, u64x4),
-    (u8x8, i8x8, i64x8, u64x8),
-
-    (u16x2, i16x2, i64x2, u64x2),
-    (u16x4, i16x4, i64x4, u64x4),
-    (u16x8, i16x8, i64x8, u64x8),
+    (u8x2, i8x2, u64x2),
+    (u8x4, i8x4, u64x4),
+    (u8x8, i8x8, u64x8),
 
-    (u32x2, i32x2, i64x2, u64x2),
-    (u32x4, i32x4, i64x4, u64x4),
-    (u32x8, i32x8, i64x8, u64x8),
+    (u16x2, i16x2, u64x2),
+    (u16x4, i16x4, u64x4),
+    (u16x8, i16x8, u64x8),
 
-    (u64x2, i64x2, i64x2, u64x2),
-    (u64x4, i64x4, i64x4, u64x4),
-    (u64x8, i64x8, i64x8, u64x8),
+    (u32x2, i32x2, u64x2),
+    (u32x4, i32x4, u64x4),
+    (u32x8, i32x8, u64x8),
 
-    (u128x2, i128x2, i64x2, u128x2),
-    (u128x4, i128x4, i64x4, u128x4)
+    (u64x2, i64x2, u64x2),
+    (u64x4, i64x4, u64x4),
+    (u64x8, i64x8, u64x8),
 
-    // (usizex2, isizex2, i64x2, u64x2),
-    // (usizex4, isizex4, i64x4, u64x4),
-    // (usizex8, isizex8, i64x8, u64x8)
+    (u128x2, i128x2, u128x2),
+    (u128x4, i128x4, u128x4),
 }
 
 #[cfg(test)]

From a9d9a88e17acae487f6fa74a320a037ab68437eb Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Thu, 24 Feb 2022 10:58:03 +0000
Subject: [PATCH 17/17] temp

---
 benches/uniform.rs           |  3 +-
 benches/uniform_simd.rs      | 69 ++++++++++++++++++++++++++++++++++++
 src/distributions/uniform.rs |  3 +-
 3 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 benches/uniform_simd.rs

diff --git a/benches/uniform.rs b/benches/uniform.rs
index cdd8376482..80520c8df6 100644
--- a/benches/uniform.rs
+++ b/benches/uniform.rs
@@ -10,9 +10,8 @@ use criterion::{criterion_group, criterion_main};
 use criterion::{BenchmarkId, Criterion};
 #[cfg(feature = "simd_support")] use packed_simd::*;
 use rand::distributions::uniform::{SampleUniform, Uniform, UniformSampler};
-use rand::prelude::*;
 
-type BenchRng = SmallRng;
+type BenchRng = rand::rngs::SmallRng;
 
 macro_rules! bench_dist_int_group {
     ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
diff --git a/benches/uniform_simd.rs b/benches/uniform_simd.rs
new file mode 100644
index 0000000000..2cf6ac0bf3
--- /dev/null
+++ b/benches/uniform_simd.rs
@@ -0,0 +1,69 @@
+// Copyright 2021 Developers of the Rand project.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use criterion::{criterion_group, criterion_main};
+use criterion::{BenchmarkGroup, BenchmarkId, Criterion};
+use rand::distributions::uniform::{SampleUniform, UniformSampler};
+use rand::prelude::*;
+use packed_simd::*;
+
+type BenchRng = SmallRng;
+
+macro_rules! bench_simd_group {
+    ($name:literal, $T:ty, $f:ident, $g:expr, $inputs:expr) => {
+        for input in $inputs {
+            $g.bench_with_input(
+                BenchmarkId::new($name, input.0),
+                &input.1,
+                |b, (low, high)| {
+                    let mut rng = BenchRng::from_entropy();
+                    b.iter(|| <$T as SampleUniform>::Sampler::$f(low, high, &mut rng))
+                },
+            );
+        }
+        $g.bench_function(BenchmarkId::new($name, "varying"), |b| {
+            let (low, mut high) = ($inputs[0].1 .0, $inputs[1].1 .1);
+            let mut rng = BenchRng::from_entropy();
+            b.iter(|| {
+                high = high.wrapping_add(1);
+                <$T as SampleUniform>::Sampler::$f(low, high, &mut rng)
+            })
+        });
+    };
+}
+
+macro_rules! bench_simd {
+    ($c:expr, $T:ty, $high:expr) => {{
+        let mut g = $c.benchmark_group(concat!("uniform_int_", stringify!($T)));
+        let inputs = &[("high_reject", $high), ("low_reject", (-1, 2))];
+
+        bench_simd_group!("Old", $T, sample_single_inclusive, g, inputs);
+// bench_simd_group!("ONeill", $T, sample_single_inclusive_oneill, g, inputs);
+        bench_simd_group!("Canon", $T, sample_single_inclusive_canon, g, inputs);
+// bench_simd_group!("Canon-Lemire", $T, sample_inclusive_canon_lemire, g, inputs);
+// bench_simd_group!("Bitmask", $T, sample_single_inclusive_bitmask, g, inputs);
+    }};
+}
+
+fn uniform_int(c: &mut Criterion) {
+    // for i8/i16, we use 32-bit integers internally so rejection is most common near full-size
+    // the exact values were determined with an exhaustive search
+    bench_simd!(c, i8x16, (i8::MIN, 116));
+// bench_simd!(c, i16, (i16::MIN, 32407));
+// bench_simd!(c, i32, (i32::MIN, 1));
+// bench_simd!(c, i64, (i64::MIN, 1));
+// bench_simd!(c, i128, (i128::MIN, 1));
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default();
+    targets = uniform_int
+    // targets = uniform_int_i8x16, uniform_int_i16, uniform_int_i32, uniform_int_i64, uniform_int_i128
+}
+criterion_main!(benches);
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 8b1beb80dc..4b81ab9ee9 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -178,8 +178,7 @@ pub use uniform_other::{UniformChar, UniformDuration};
     feature = "serde1",
     serde(bound(deserialize = "X::Sampler: Deserialize<'de>"))
 )]
-// HACK: field is pub
-pub struct Uniform<X: SampleUniform>(pub X::Sampler);
+pub struct Uniform<X: SampleUniform>(X::Sampler);
 
 impl<X: SampleUniform> Uniform<X> {
     /// Create a new `Uniform` instance which samples uniformly from the half