Commit

Fix conditional compilation, so that everything builds on machines without AVX512

martinothamar committed Jul 29, 2023
1 parent 1d1ab21 commit 20ddb7b
Showing 8 changed files with 132 additions and 70 deletions.
4 changes: 4 additions & 0 deletions Makefile
@@ -25,6 +25,10 @@ dasm:
cargo objdump --example dasm --release -- \
-d -M intel > $(bindir)/dasm.asm 2> $(bindir)/dasm.asm.log

dasmbench:
cargo objdump --bench main --release -- \
-d -M intel > target/release/bench.asm 2> target/release/bench.asm.log

asm:
cargo rustc --release --example dasm -- --emit asm -C "llvm-args=-x86-asm-syntax=intel"

14 changes: 13 additions & 1 deletion README.md
@@ -5,7 +5,9 @@ Categories:
- [`portable`] - portable implementations using `std::simd` (nightly required)
- [`specific`] - implementations using architecture-specific hardware intrinsics
- [`specific::avx2`] - AVX2 for x86_64 architecture (4 lanes for 64bit)
- Requires the `avx2` CPU flag, but has additional optimizations if you also have `avx512dq` and `avx512vl`
- [`specific::avx512`] - AVX512 for x86_64 architecture (8 lanes for 64bit)
- Requires the `avx512f` and `avx512dq` CPU flags

Vectorized PRNG implementations may perform anywhere from 4-6x faster in my experience,
though this is of course very dependent on the hardware used ("old" CPUs with AVX512, for example, may suffer from excessive thermal throttling).
@@ -40,11 +42,21 @@ fn main() {
}
```

The `portable` module is available on any architecture; even on x86_64 with only AVX2, for example, you can still use `Xoshiro256PlusPlusX8`, which uses
8-lane/512-bit vectors (`u64x8` from `std::simd`). The compiler is able to make it reasonably fast even if the generated code can only use 256-bit wide registers (AVX2).

The `specific` submodules (currently AVX2 and AVX512) are only compiled in when the target arch/features support them.

In general, use the `portable` module. The only risk/drawback to using the `portable` module is that, in principle,
the compiler isn't _forced_ to use the "optimal" instructions and registers for your hardware. In practice, it probably will, though.
In the `specific` submodules the respective hardware intrinsics are "hardcoded", so to speak, so we always know what the generated code looks like.
In some contexts that may be useful.
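
A minimal sketch of the `portable` path, assuming nightly Rust (`std::simd` is unstable) and using the type and trait names that appear in the diffs below:

```rust
#![feature(portable_simd)]

use std::simd::u64x8;

use rand_core::SeedableRng;
use simd_rand::portable::{SimdRandX8, Xoshiro256PlusX8};

fn main() {
    // Seed the 8-lane Xoshiro256+ generator.
    let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EED);

    // One call yields eight u64s at once as a portable SIMD vector.
    let v: u64x8 = rng.next_u64x8();
    println!("{:?}", v.to_array());
}
```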

## Performance

The top-performing generator (on my current hardware) is currently Xoshiro256+ using AVX512 intrinsics,
at about 5.9x faster than the scalar baseline. The benchmarks below generate `u64x8` numbers in a loop.
Note that the RandVectorized variant uses `simd_support` from the `rand` crate,
but this doesn't actually vectorize random number generation.

If you want to actually use these generators, you should benchmark them yourself on your own hardware. See the `bench` target in the [Makefile](/Makefile).
19 changes: 7 additions & 12 deletions benches/main.rs
@@ -13,7 +13,6 @@ use criterion_perf_events::Perf;
use perfcnt::linux::HardwareEventType as Hardware;
use perfcnt::linux::PerfCounterBuilderLinux as Builder;
use rand_core::SeedableRng;
use simd_rand::specific::avx2::*;

mod portable;
mod scratch;
@@ -34,17 +33,13 @@ fn bench<M: Measurement, const T: u8>(c: &mut Criterion<M>) {

crate::portable::add_benchmarks::<_, ITERATIONS>(c, suffix);

if cfg!(all(target_arch = "x86_64", target_feature = "avx2")) {
crate::specific::avx2::add_benchmarks::<_, ITERATIONS>(c, suffix);
}
if cfg!(all(
target_arch = "x86_64",
target_feature = "avx512f",
target_feature = "avx512dq"
)) {
crate::specific::avx512::add_benchmarks::<_, ITERATIONS>(c, suffix);
crate::top::add_top_benchmark::<_, ITERATIONS>(c);
}
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
crate::specific::avx2::add_benchmarks::<_, ITERATIONS>(c, suffix);

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
crate::specific::avx512::add_benchmarks::<_, ITERATIONS>(c, suffix);

crate::top::add_top_benchmark::<_, ITERATIONS>(c);
}

#[non_exhaustive]
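
This hunk is the crux of the fix: `cfg!(...)` expands to a compile-time boolean, but the body of the `if` still has to type-check on every target, so machines without AVX512 failed to build. The `#[cfg(...)]` attribute removes the annotated code from compilation entirely. A minimal sketch of the difference, with `avx512_only` as a hypothetical stand-in:

```rust
fn main() {
    // Broken on non-AVX512 targets: this branch is never taken,
    // but its body must still type-check, so the build fails if
    // `avx512_only` references items gated behind AVX512.
    //
    // if cfg!(target_feature = "avx512f") {
    //     avx512_only();
    // }

    // OK: the attribute strips the statement before type-checking
    // on targets without the feature.
    #[cfg(target_feature = "avx512f")]
    avx512_only();
}

#[cfg(target_feature = "avx512f")]
fn avx512_only() {
    println!("taking the AVX512 path");
}
```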
36 changes: 23 additions & 13 deletions benches/scratch/mod.rs
@@ -5,9 +5,7 @@ use criterion::{black_box, measurement::Measurement, BenchmarkId, Criterion, Thr
use rand_core::{SeedableRng, RngCore};
use rand_xoshiro::Xoshiro256Plus;
use simd_rand::portable;
use simd_rand::portable::SimdRandX8 as PortableSimdRand;
use simd_rand::specific;
use simd_rand::specific::avx512::SimdRand as SpecificSimdRand;
use simd_rand::portable::*;

#[inline(always)]
fn do_u64_baseline(rng: &mut Xoshiro256Plus) -> u64x8 {
@@ -24,15 +22,27 @@ fn do_u64_baseline(rng: &mut Xoshiro256Plus) -> u64x8 {
}

#[inline(always)]
fn do_u64_portable<RNG: PortableSimdRand>(rng: &mut RNG) -> u64x8 {
rng.next_u64x8()
fn do_u64_portable_x4<RNG: SimdRandX4>(rng: &mut RNG) -> u64x8 {
let a = rng.next_u64x4();
let b = rng.next_u64x4();
u64x8::from_array([
a[0],
a[1],
a[2],
a[3],
b[0],
b[1],
b[2],
b[3],
])
}

#[inline(always)]
fn do_u64_specific<RNG: SpecificSimdRand>(rng: &mut RNG) -> __m512i {
rng.next_m512i()
fn do_u64_portable_x8<RNG: SimdRandX8>(rng: &mut RNG) -> u64x8 {
rng.next_u64x8()
}


pub fn add_benchmarks<M: Measurement>(c: &mut Criterion<M>, suffix: &str) {
let mut group = c.benchmark_group("Scratch");

@@ -44,17 +54,17 @@ pub fn add_benchmarks<M: Measurement>(c: &mut Criterion<M>, suffix: &str) {

b.iter(|| do_u64_baseline(&mut rng))
});
let name = BenchmarkId::new(format!("Portable/Xoshiro256+/{suffix}"), "1");
let name = BenchmarkId::new(format!("Portable/Xoshiro256+X4/{suffix}"), "1");
group.bench_with_input(name, &1, |b, _| {
let mut rng = portable::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut rng = portable::Xoshiro256PlusX4::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);

b.iter(|| do_u64_portable(&mut rng))
b.iter(|| do_u64_portable_x4(&mut rng))
});
let name = BenchmarkId::new(format!("Specific/Xoshiro256+/{suffix}"), "1");
let name = BenchmarkId::new(format!("Portable/Xoshiro256+X8/{suffix}"), "1");
group.bench_with_input(name, &1, |b, _| {
let mut rng = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut rng = portable::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);

b.iter(|| do_u64_specific(&mut rng))
b.iter(|| do_u64_portable_x8(&mut rng))
});

group.finish();
94 changes: 51 additions & 43 deletions benches/top/mod.rs
@@ -1,67 +1,75 @@
use std::{mem, arch::x86_64::*};
use std::{mem, arch::x86_64::*, simd::u64x8};

use criterion::{measurement::Measurement, Criterion, Throughput, black_box, BenchmarkId};
use rand::Rng;
use rand_core::{RngCore, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;
use simd_rand::specific::avx512::{U64x8, SimdRand, Xoshiro256PlusX8};
use simd_rand::portable::*;
use simd_rand::specific;
use packed_simd_2::u64x8 as ps_u64x8;

#[inline(always)]
fn execute_rand<RNG: RngCore>(rng: &mut RNG, data: &mut U64x8, i: usize) {
for _ in 0..i {
for i in 0..8 {
black_box(&mut *data)[i] = rng.next_u64();
}
#[inline(never)]
fn execute_rand<RNG: RngCore>(rng: &mut RNG, data: &mut u64x8) {
for i in 0..8 {
data[i] = rng.next_u64();
}
}

#[inline(always)]
fn execute_rand_vectorized<RNG: RngCore>(rng: &mut RNG, data: &mut ps_u64x8, i: usize) {
for _ in 0..i {
*black_box(&mut *data) = rng.gen::<ps_u64x8>();
}
#[inline(never)]
fn execute_rand_vectorized<RNG: RngCore>(rng: &mut RNG, data: &mut ps_u64x8) {
*data = rng.gen::<ps_u64x8>();
}

#[inline(always)]
fn execute_vectorized<RNG: SimdRand>(rng: &mut RNG, data: &mut __m512i, i: usize) {
for _ in 0..i {
*black_box(&mut *data) = rng.next_m512i();
}
#[inline(never)]
fn execute_vectorized_portable<RNG: SimdRandX8>(rng: &mut RNG, data: &mut u64x8) {
*data = rng.next_u64x8();
}

pub fn add_top_benchmark<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>) {
let mut group = c.benchmark_group("top");
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
#[inline(never)]
fn execute_vectorized_specific<RNG: specific::avx512::SimdRand>(rng: &mut RNG, data: &mut __m512i) {
*data = rng.next_m512i();
}

let iterations: Vec<_> = (0..8).map(|v| (v + 1) * ITERATIONS).collect();
pub fn add_top_benchmark<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>) {
let mut group = c.benchmark_group("Top");

for iterations in iterations {
group.throughput(Throughput::Bytes((iterations * mem::size_of::<__m512i>()) as u64));
group.throughput(Throughput::Bytes(mem::size_of::<u64x8>() as u64));

let name = BenchmarkId::new(format!("Rand/Xoshiro256+"), iterations);
group.bench_with_input(name, &iterations, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();
let name = BenchmarkId::new(format!("Rand/Xoshiro256+"), 1);

b.iter(|| execute_rand(&mut rng, black_box(&mut data), black_box(*i)))
});
group.bench_with_input(name, &1, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();

let name = BenchmarkId::new(format!("RandVectorized/Xoshiro256+"), iterations);
group.bench_with_input(name, &iterations, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();
b.iter(|| execute_rand(&mut rng, black_box(&mut data)))
});

b.iter(|| execute_rand_vectorized(&mut rng, black_box(&mut data), black_box(*i)))
});

let name = BenchmarkId::new(format!("AVX512/Xoshiro256+"), iterations);
group.bench_with_input(name, &iterations, |b, i| unsafe {
let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data: __m512i = _mm512_setzero_si512();
let name = BenchmarkId::new(format!("RandVectorized/Xoshiro256+"), 1);
group.bench_with_input(name, &1, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();

b.iter(|| execute_vectorized(&mut rng, black_box(&mut data), black_box(*i)))
});
}
b.iter(|| execute_rand_vectorized(&mut rng, black_box(&mut data)))
});

let name = BenchmarkId::new(format!("Portable/Xoshiro256+X8"), 1);
group.bench_with_input(name, &1, |b, i| {
let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();

b.iter(|| execute_vectorized_portable(&mut rng, black_box(&mut data)))
});

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
let name = BenchmarkId::new(format!("Specific/Xoshiro256+X8"), iterations);
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
group.bench_with_input(name, &iterations, |b, i| unsafe {
let mut rng = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data: __m512i = _mm512_setzero_si512();

b.iter(|| execute_vectorized_specific(&mut rng, black_box(&mut data)))
});

group.finish();
}
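
A side note on this hunk: the benchmark bodies moved from `#[inline(always)]` to `#[inline(never)]`, which keeps each generator's code a separate, inspectable unit in the disassembly (see the `dasmbench` Makefile target), while `black_box` stops the optimizer from deleting the work as dead code. The pattern in miniature, using `std::hint::black_box` as a stand-in for criterion's:

```rust
use std::hint::black_box;

#[inline(never)]
fn fill(data: &mut [u64; 8]) {
    // Stand-in for eight rng.next_u64() calls.
    for (i, slot) in data.iter_mut().enumerate() {
        *slot = i as u64;
    }
}

fn main() {
    let mut data = [0u64; 8];
    // black_box hides the pointer from the optimizer,
    // so the stores inside `fill` cannot be elided.
    fill(black_box(&mut data));
    black_box(&data);
}
```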
32 changes: 32 additions & 0 deletions examples/_internal/dasm.rs
@@ -7,18 +7,30 @@ use std::simd::u64x4;
use std::simd::u64x8;

use criterion::black_box;
use rand_core::RngCore;
use rand_core::SeedableRng;
use simd_rand::portable;
use simd_rand::portable::SimdRandX4 as PortableSimdRandX4;
use simd_rand::portable::SimdRandX8 as PortableSimdRandX8;
use simd_rand::specific;
use simd_rand::specific::avx2::SimdRand as SpecificSimdRandX4;
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
use simd_rand::specific::avx512::SimdRand as SpecificSimdRandX8;

/// This is a small binary meant to aid in analyzing generated code
/// For example to see differences between portable and specific code,
/// and simd_rand and rand code
#[inline(never)]
fn do_u64x4_baseline<RNG: RngCore>(rng: &mut RNG) -> u64x4 {
u64x4::from_array([
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
])
}

#[inline(never)]
fn do_u64x4_portable<RNG: PortableSimdRandX4>(rng: &mut RNG) -> u64x4 {
rng.next_u64x4()
@@ -29,11 +41,26 @@ fn do_u64x4_specific<RNG: SpecificSimdRandX4>(rng: &mut RNG) -> __m256i {
rng.next_m256i()
}

#[inline(never)]
fn do_u64x8_baseline<RNG: RngCore>(rng: &mut RNG) -> u64x8 {
u64x8::from_array([
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
])
}

#[inline(never)]
fn do_u64x8_portable<RNG: PortableSimdRandX8>(rng: &mut RNG) -> u64x8 {
rng.next_u64x8()
}

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
#[inline(never)]
fn do_u64x8_specific<RNG: SpecificSimdRandX8>(rng: &mut RNG) -> __m512i {
rng.next_m512i()
@@ -50,14 +77,19 @@ fn do_f64x4_portable<RNG: PortableSimdRandX4>(rng: &mut RNG) -> f64x4 {
}

fn main() {
let mut rng_base = rand_xoshiro::Xoshiro256Plus::seed_from_u64(0);
let mut rng1 = portable::Xoshiro256PlusX4::seed_from_u64(0);
let mut rng2 = specific::avx2::Xoshiro256PlusX4::seed_from_u64(0);
let mut rng3 = portable::Xoshiro256PlusX8::seed_from_u64(0);
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
let mut rng4 = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0);

black_box(do_u64x4_baseline(&mut rng_base));
black_box(do_u64x4_portable(&mut rng1));
black_box(do_u64x4_specific(&mut rng2));
black_box(do_u64x8_baseline(&mut rng_base));
black_box(do_u64x8_portable(&mut rng3));
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
black_box(do_u64x8_specific(&mut rng4));
black_box(do_f64x4_specific(&mut rng2));
black_box(do_f64x4_portable(&mut rng1));
1 change: 1 addition & 0 deletions src/lib.rs
@@ -67,6 +67,7 @@
//!
//! There is also some inline assembly used, where the C-style intrinsics haven't been exposed as Rust APIs in `std::arch`.
// The stdsimd feature is required for some of the AVX512 intrinsics
#![cfg_attr(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"), feature(stdsimd))]
#![feature(portable_simd)]

2 changes: 1 addition & 1 deletion src/specific/avx2/simdrand.rs
@@ -49,7 +49,7 @@ pub trait SimdRand {

#[inline(always)]
unsafe fn m256i_to_m256d(v: __m256i) -> __m256d {
if is_x86_feature_detected!("avx512dq") || is_x86_feature_detected!("avx512vl") {
if is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl") {
// With AVX512 DQ/VL we can use the below instruction
// with both 512bit and 256bit vectors
// see https://www.felixcloutier.com/x86/vcvtuqq2pd
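
The `||` → `&&` change fixes a logic bug in the runtime feature detection: the 256-bit form of `vcvtuqq2pd` requires both `avx512dq` and `avx512vl`, so the old check could take the AVX512 path on a CPU with only one of the two flags. The corrected pattern in isolation (x86_64 only):

```rust
fn main() {
    // Both flags are required for vcvtuqq2pd on 256-bit vectors,
    // so the detections must be combined with `&&`, not `||`.
    let fast_path = is_x86_feature_detected!("avx512dq")
        && is_x86_feature_detected!("avx512vl");
    println!("can use vcvtuqq2pd on 256-bit vectors: {fast_path}");
}
```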
