diff --git a/Makefile b/Makefile
index 86d0ee6..878247d 100644
--- a/Makefile
+++ b/Makefile
@@ -25,6 +25,10 @@ dasm:
 	cargo objdump --example dasm --release -- \
 		-d -M intel > $(bindir)/dasm.asm 2> $(bindir)/dasm.asm.log
 
+dasmbench:
+	cargo objdump --bench main --release -- \
+		-d -M intel > target/release/bench.asm 2> target/release/bench.asm.log
+
 asm:
 	cargo rustc --release --example dasm -- --emit asm -C "llvm-args=-x86-asm-syntax=intel"
diff --git a/README.md b/README.md
index 3724b29..f2284df 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,9 @@ Categories:
 - [`portable`] - portable implementations using `std::simd` (nightly required)
 - [`specific`] - implementations using architecture-specific hardware intrinsics
   - [`specific::avx2`] - AVX2 for x86_64 architecture (4 lanes for 64bit)
+    - Requires the `avx2` CPU flag, but has additional optimizations if you have `avx512dq` and `avx512vl`
   - [`specific::avx512`] - AVX512 for x86_64 architecture (8 lanes for 64bit)
+    - Requires the `avx512f` and `avx512dq` CPU flags
 
 Vectorized PRNG implementations may perform anywhere from 4-6 times faster in my experience, though this is of course highly dependent on the hardware used ("old" CPUs with AVX512, for example, may have excessive thermal throttling).
@@ -40,11 +42,21 @@ fn main() {
 }
 ```
 
+The `portable` module is available on any architecture. Even on x86_64 with only AVX2, for example, you can still use `Xoshiro256PlusPlusX8`, which uses
+8-lane/512bit vectors (`u64x8` from `std::simd`). The compiler is able to make it reasonably fast even if it can only use 256bit wide registers (AVX2) in the generated code.
+
+The `specific` submodules (currently AVX2 and AVX512) are only compiled in depending on the target arch/features.
+
+In general, use the `portable` module. The only risk/drawback to using the `portable` module is that in principle
+the compiler isn't _forced_ to use the "optimal" instructions and registers for your hardware. In practice, it probably will though.
+In the `specific` submodules the respective hardware intrinsics are "hardcoded", so to speak, so we always know what the generated code looks like.
+In some contexts that may be useful.
+
 ## Performance
 
 The top performing generator (on my current hardware) is currently Xoshiro256+ using AVX512 intrinsics. It is about 5.9x faster than its scalar counterpart.
 The benchmark below generates `u64x8` numbers in a loop.
-Note that the RandVectorized variant uses `simd_support` from the rand crate,
+Note that the RandVectorized variant uses `simd_support` from the `rand` crate,
 but this doesn't actually vectorize random number generation.
 If you want to actually use these generators, you should benchmark them yourself on your own hardware. See the `bench` target in the [Makefile](/Makefile).
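For reference, the usage the new README paragraphs describe boils down to the sketch below. It reuses the same names the benchmarks in this patch use (`portable::Xoshiro256PlusX8`, `seed_from_u64`, `next_u64x8`); it is illustrative and not part of the patch.

```rust
// Minimal sketch of the portable 8-lane generator described above.
// Requires nightly, since the crate builds on `std::simd`.
use rand_core::SeedableRng;
use simd_rand::portable::*;

fn main() {
    // Same seed constant as the benchmarks in this patch
    let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
    // One call fills 8 u64 lanes (a 512-bit vector)
    let v = rng.next_u64x8();
    println!("{v:?}");
}
```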
diff --git a/benches/main.rs b/benches/main.rs
index c8f14c3..1c5ebb1 100644
--- a/benches/main.rs
+++ b/benches/main.rs
@@ -13,7 +13,6 @@ use criterion_perf_events::Perf;
 use perfcnt::linux::HardwareEventType as Hardware;
 use perfcnt::linux::PerfCounterBuilderLinux as Builder;
 use rand_core::SeedableRng;
-use simd_rand::specific::avx2::*;
 
 mod portable;
 mod scratch;
@@ -34,17 +33,13 @@ fn bench(c: &mut Criterion<Perf>) {
 
     crate::portable::add_benchmarks::<_, ITERATIONS>(c, suffix);
 
-    if cfg!(all(target_arch = "x86_64", target_feature = "avx2")) {
-        crate::specific::avx2::add_benchmarks::<_, ITERATIONS>(c, suffix);
-    }
-    if cfg!(all(
-        target_arch = "x86_64",
-        target_feature = "avx512f",
-        target_feature = "avx512dq"
-    )) {
-        crate::specific::avx512::add_benchmarks::<_, ITERATIONS>(c, suffix);
-        crate::top::add_top_benchmark::<_, ITERATIONS>(c);
-    }
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
+    crate::specific::avx2::add_benchmarks::<_, ITERATIONS>(c, suffix);
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
+    crate::specific::avx512::add_benchmarks::<_, ITERATIONS>(c, suffix);
+
+    crate::top::add_top_benchmark::<_, ITERATIONS>(c);
 }
 
 #[non_exhaustive]
diff --git a/benches/scratch/mod.rs b/benches/scratch/mod.rs
index 8586fdf..2a304a0 100644
--- a/benches/scratch/mod.rs
+++ b/benches/scratch/mod.rs
@@ -5,9 +5,7 @@ use criterion::{black_box, measurement::Measurement, BenchmarkId, Criterion, Thr
 use rand_core::{SeedableRng, RngCore};
 use rand_xoshiro::Xoshiro256Plus;
 use simd_rand::portable;
-use simd_rand::portable::SimdRandX8 as PortableSimdRand;
-use simd_rand::specific;
-use simd_rand::specific::avx512::SimdRand as SpecificSimdRand;
+use simd_rand::portable::*;
 
 #[inline(always)]
 fn do_u64_baseline(rng: &mut Xoshiro256Plus) -> u64x8 {
@@ -24,15 +22,27 @@ fn do_u64_baseline(rng: &mut Xoshiro256Plus) -> u64x8 {
 }
 
 #[inline(always)]
-fn do_u64_portable<RNG: PortableSimdRand>(rng: &mut RNG) -> u64x8 {
-    rng.next_u64x8()
+fn do_u64_portable_x4<RNG: SimdRandX4>(rng: &mut RNG) -> u64x8 {
+    let a = rng.next_u64x4();
+    let b = rng.next_u64x4();
+    u64x8::from_array([
+        a[0],
+        a[1],
+        a[2],
+        a[3],
+        b[0],
+        b[1],
+        b[2],
+        b[3],
+    ])
 }
 
 #[inline(always)]
-fn do_u64_specific<RNG: SpecificSimdRand>(rng: &mut RNG) -> __m512i {
-    rng.next_m512i()
+fn do_u64_portable_x8<RNG: SimdRandX8>(rng: &mut RNG) -> u64x8 {
+    rng.next_u64x8()
 }
 
+
 pub fn add_benchmarks<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>, suffix: &str) {
     let mut group = c.benchmark_group("Scratch");
@@ -44,17 +54,17 @@ pub fn add_benchmarks<M: Measurement, const ITERATIONS: usize>(c: &mut Criterio
         b.iter(|| do_u64_baseline(&mut rng))
     });
 
-    let name = BenchmarkId::new(format!("Portable/Xoshiro256+/{suffix}"), "1");
+    let name = BenchmarkId::new(format!("Portable/Xoshiro256+X4/{suffix}"), "1");
     group.bench_with_input(name, &1, |b, _| {
-        let mut rng = portable::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
+        let mut rng = portable::Xoshiro256PlusX4::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
 
-        b.iter(|| do_u64_portable(&mut rng))
+        b.iter(|| do_u64_portable_x4(&mut rng))
     });
 
-    let name = BenchmarkId::new(format!("Specific/Xoshiro256+/{suffix}"), "1");
+    let name = BenchmarkId::new(format!("Portable/Xoshiro256+X8/{suffix}"), "1");
     group.bench_with_input(name, &1, |b, _| {
-        let mut rng = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
+        let mut rng = portable::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
 
-        b.iter(|| do_u64_specific(&mut rng))
+        b.iter(|| do_u64_portable_x8(&mut rng))
     });
 
     group.finish();
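The `cfg!`-to-`#[cfg]` switch in `benches/main.rs` matters because `cfg!(...)` only expands to a runtime `bool`: the code inside the `if` still has to compile on targets that lack the feature. A minimal stand-alone illustration of the difference (hypothetical function names, not from this repo):

```rust
fn select_impl() {
    // `cfg!` yields `true`/`false`, but BOTH branches are type-checked and
    // compiled, so a call to an AVX2-only item here would have to exist
    // (and build) on every target:
    if cfg!(target_feature = "avx2") {
        // avx2_path(); // would break the build on non-AVX2 targets
    }

    // The attribute form removes the statement from compilation entirely
    // on targets without AVX2, which is what the benches need:
    #[cfg(target_feature = "avx2")]
    avx2_path();
}

#[cfg(target_feature = "avx2")]
fn avx2_path() {}

fn main() {
    select_impl();
}
```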
diff --git a/benches/top/mod.rs b/benches/top/mod.rs
index 6504fc1..48e3d3a 100644
--- a/benches/top/mod.rs
+++ b/benches/top/mod.rs
@@ -1,67 +1,75 @@
-use std::{mem, arch::x86_64::*};
+use std::{mem, arch::x86_64::*, simd::u64x8};
 
 use criterion::{measurement::Measurement, Criterion, Throughput, black_box, BenchmarkId};
 use rand::Rng;
 use rand_core::{RngCore, SeedableRng};
 use rand_xoshiro::Xoshiro256Plus;
-use simd_rand::specific::avx512::{U64x8, SimdRand, Xoshiro256PlusX8};
+use simd_rand::portable::*;
+use simd_rand::specific;
 use packed_simd_2::u64x8 as ps_u64x8;
 
-#[inline(always)]
-fn execute_rand<RNG: RngCore>(rng: &mut RNG, data: &mut U64x8, i: usize) {
-    for _ in 0..i {
-        for i in 0..8 {
-            black_box(&mut *data)[i] = rng.next_u64();
-        }
+#[inline(never)]
+fn execute_rand<RNG: RngCore>(rng: &mut RNG, data: &mut u64x8) {
+    for i in 0..8 {
+        data[i] = rng.next_u64();
     }
 }
 
-#[inline(always)]
-fn execute_rand_vectorized<RNG: Rng>(rng: &mut RNG, data: &mut ps_u64x8, i: usize) {
-    for _ in 0..i {
-        *black_box(&mut *data) = rng.gen::<ps_u64x8>();
-    }
+#[inline(never)]
+fn execute_rand_vectorized<RNG: Rng>(rng: &mut RNG, data: &mut ps_u64x8) {
+    *data = rng.gen::<ps_u64x8>();
 }
 
-#[inline(always)]
-fn execute_vectorized<RNG: SimdRand>(rng: &mut RNG, data: &mut __m512i, i: usize) {
-    for _ in 0..i {
-        *black_box(&mut *data) = rng.next_m512i();
-    }
+#[inline(never)]
+fn execute_vectorized_portable<RNG: SimdRandX8>(rng: &mut RNG, data: &mut u64x8) {
+    *data = rng.next_u64x8();
 }
 
-pub fn add_top_benchmark<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>) {
-    let mut group = c.benchmark_group("top");
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
+#[inline(never)]
+fn execute_vectorized_specific<RNG: specific::avx512::SimdRand>(rng: &mut RNG, data: &mut __m512i) {
+    *data = rng.next_m512i();
+}
 
-    let iterations: Vec<_> = (0..8).map(|v| (v + 1) * ITERATIONS).collect();
+pub fn add_top_benchmark<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>) {
+    let mut group = c.benchmark_group("Top");
 
-    for iterations in iterations {
-        group.throughput(Throughput::Bytes((iterations * mem::size_of::<__m512i>()) as u64));
+    group.throughput(Throughput::Bytes(mem::size_of::<u64x8>() as u64));
 
-        let name = BenchmarkId::new(format!("Rand/Xoshiro256+"), iterations);
-        group.bench_with_input(name, &iterations, |b, i| {
-            let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
-            let mut data = Default::default();
+    let name = BenchmarkId::new(format!("Rand/Xoshiro256+"), 1);
 
-            b.iter(|| execute_rand(&mut rng, black_box(&mut data), black_box(*i)))
-        });
+    group.bench_with_input(name, &1, |b, i| {
+        let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
+        let mut data = Default::default();
 
-        let name = BenchmarkId::new(format!("RandVectorized/Xoshiro256+"), iterations);
-        group.bench_with_input(name, &iterations, |b, i| {
-            let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
-            let mut data = Default::default();
+        b.iter(|| execute_rand(&mut rng, black_box(&mut data)))
+    });
 
-            b.iter(|| execute_rand_vectorized(&mut rng, black_box(&mut data), black_box(*i)))
-        });
-
-        let name = BenchmarkId::new(format!("AVX512/Xoshiro256+"), iterations);
-        group.bench_with_input(name, &iterations, |b, i| unsafe {
-            let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
-            let mut data: __m512i = _mm512_setzero_si512();
+    let name = BenchmarkId::new(format!("RandVectorized/Xoshiro256+"), 1);
+    group.bench_with_input(name, &1, |b, i| {
+        let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
+        let mut data = Default::default();
 
-            b.iter(|| execute_vectorized(&mut rng, black_box(&mut data), black_box(*i)))
-        });
-    }
+        b.iter(|| execute_rand_vectorized(&mut rng, black_box(&mut data)))
+    });
+
+    let name = BenchmarkId::new(format!("Portable/Xoshiro256+X8"), 1);
+    group.bench_with_input(name, &1, |b, i| {
+        let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
+        let mut data = Default::default();
+
+        b.iter(|| execute_vectorized_portable(&mut rng, black_box(&mut data)))
+    });
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
+    let name = BenchmarkId::new(format!("Specific/Xoshiro256+X8"), 1);
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
+    group.bench_with_input(name, &1, |b, i| unsafe {
+        let mut rng = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
+        let mut data: __m512i = _mm512_setzero_si512();
+
+        b.iter(|| execute_vectorized_specific(&mut rng, black_box(&mut data)))
+    });
 
     group.finish();
 }
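The rewritten `top` benchmark measures one generator call per iteration instead of an inner loop, so `Throughput::Bytes` becomes the size of a single `u64x8` (64 bytes) and criterion reports throughput per call directly; `#[inline(never)]` keeps each measured function a real call, so it also shows up as its own symbol under the new `dasmbench` target. A stripped-down version of that pattern with a stand-in generator (illustrative, not from the patch; the SplitMix64-style step is only a placeholder for `rng.next_u64x8()`):

```rust
// Assumes a nightly toolchain with `#![feature(portable_simd)]` at the
// crate root, as these benches already require.
use std::mem;
use std::simd::u64x8;

use criterion::{black_box, Criterion, Throughput};

#[inline(never)]
fn generate(state: &mut u64) -> u64x8 {
    // Placeholder for rng.next_u64x8(): one additive step per lane
    u64x8::from_array(std::array::from_fn(|_| {
        *state = state.wrapping_add(0x9E3779B97F4A7C15);
        *state
    }))
}

fn add_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("Top");
    // 64 bytes produced per iteration => criterion scales this to GB/s
    group.throughput(Throughput::Bytes(mem::size_of::<u64x8>() as u64));
    group.bench_function("StandIn", |b| {
        let mut state = 0x0DDB1A5E5BAD5EEDu64;
        // black_box keeps the result observable so the call isn't elided
        b.iter(|| black_box(generate(&mut state)))
    });
    group.finish();
}
```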
diff --git a/examples/_internal/dasm.rs b/examples/_internal/dasm.rs
index edce6b7..d2c9fd6 100644
--- a/examples/_internal/dasm.rs
+++ b/examples/_internal/dasm.rs
@@ -7,18 +7,30 @@ use std::simd::u64x4;
 use std::simd::u64x8;
 
 use criterion::black_box;
+use rand_core::RngCore;
 use rand_core::SeedableRng;
 use simd_rand::portable;
 use simd_rand::portable::SimdRandX4 as PortableSimdRandX4;
 use simd_rand::portable::SimdRandX8 as PortableSimdRandX8;
 use simd_rand::specific;
 use simd_rand::specific::avx2::SimdRand as SpecificSimdRandX4;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
 use simd_rand::specific::avx512::SimdRand as SpecificSimdRandX8;
 
 /// This is a small binary meant to aid in analyzing generated code
 /// For example to see differences between portable and specific code,
 /// and simd_rand and rand code
+#[inline(never)]
+fn do_u64x4_baseline<RNG: RngCore>(rng: &mut RNG) -> u64x4 {
+    u64x4::from_array([
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+    ])
+}
+
 #[inline(never)]
 fn do_u64x4_portable<RNG: PortableSimdRandX4>(rng: &mut RNG) -> u64x4 {
     rng.next_u64x4()
@@ -29,11 +41,26 @@ fn do_u64x4_specific<RNG: SpecificSimdRandX4>(rng: &mut RNG) -> __m256i {
     rng.next_m256i()
 }
 
+#[inline(never)]
+fn do_u64x8_baseline<RNG: RngCore>(rng: &mut RNG) -> u64x8 {
+    u64x8::from_array([
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+        rng.next_u64(),
+    ])
+}
+
 #[inline(never)]
 fn do_u64x8_portable<RNG: PortableSimdRandX8>(rng: &mut RNG) -> u64x8 {
     rng.next_u64x8()
 }
 
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
 #[inline(never)]
 fn do_u64x8_specific<RNG: SpecificSimdRandX8>(rng: &mut RNG) -> __m512i {
     rng.next_m512i()
@@ -50,14 +77,19 @@ fn do_f64x4_portable<RNG: PortableSimdRandX4>(rng: &mut RNG) -> f64x4 {
 }
 
 fn main() {
+    let mut rng_base = rand_xoshiro::Xoshiro256Plus::seed_from_u64(0);
     let mut rng1 = portable::Xoshiro256PlusX4::seed_from_u64(0);
     let mut rng2 = specific::avx2::Xoshiro256PlusX4::seed_from_u64(0);
     let mut rng3 = portable::Xoshiro256PlusX8::seed_from_u64(0);
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
     let mut rng4 = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0);
 
+    black_box(do_u64x4_baseline(&mut rng_base));
     black_box(do_u64x4_portable(&mut rng1));
     black_box(do_u64x4_specific(&mut rng2));
+    black_box(do_u64x8_baseline(&mut rng_base));
     black_box(do_u64x8_portable(&mut rng3));
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
    black_box(do_u64x8_specific(&mut rng4));
     black_box(do_f64x4_specific(&mut rng2));
     black_box(do_f64x4_portable(&mut rng1));
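The `#[inline(never)]` plus `black_box` pattern that `dasm.rs` relies on, reduced to its essentials (hypothetical toy kernel, not from the patch): each function keeps its own symbol in the `cargo objdump` output, and `black_box` prevents the call and its arguments from being optimized away.

```rust
use std::hint::black_box;

// `#[inline(never)]` keeps the function as a distinct symbol, so its
// codegen can be located and inspected in the disassembly.
#[inline(never)]
fn toy_kernel(x: u64) -> u64 {
    x.rotate_left(23).wrapping_mul(0x9E3779B97F4A7C15)
}

fn main() {
    // black_box on both input and output stops the call from being
    // constant-folded or dead-code-eliminated in release builds.
    black_box(toy_kernel(black_box(42)));
}
```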
diff --git a/src/lib.rs b/src/lib.rs
index ccd9e40..62df5c8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -67,6 +67,7 @@
 //!
 //! There is also some inline assembly used, where the C-style intrinsics haven't been exposed as Rust APIs in `std::arch`.
 
+// The stdsimd feature is required for some of the AVX512 intrinsics
 #![cfg_attr(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"), feature(stdsimd))]
 #![feature(portable_simd)]
diff --git a/src/specific/avx2/simdrand.rs b/src/specific/avx2/simdrand.rs
index 0cb6de7..ee065a3 100644
--- a/src/specific/avx2/simdrand.rs
+++ b/src/specific/avx2/simdrand.rs
@@ -49,7 +49,7 @@ pub trait SimdRand {
 
     #[inline(always)]
     unsafe fn m256i_to_m256d(v: __m256i) -> __m256d {
-        if is_x86_feature_detected!("avx512dq") || is_x86_feature_detected!("avx512vl") {
+        if is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl") {
             // With AVX512 DQ/VL we can use the below instruction
             // with both 512bit and 256bit vectors
             // see https://www.felixcloutier.com/x86/vcvtuqq2pd
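On the `||` to `&&` fix itself: `VCVTUQQ2PD` (the unsigned-quadword-to-double conversion the comment links to) is an AVX512DQ instruction, and using it on 256-bit registers additionally requires AVX512VL, so the fast path is only valid when both flags are present. A stand-alone sketch of that check (illustrative only, not the crate's actual conversion code):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    // Both flags are needed: DQ provides vcvtuqq2pd, VL allows it to
    // operate on 128/256-bit vectors instead of only 512-bit ones.
    if is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl") {
        println!("hardware u64 -> f64 conversion available for 256-bit vectors");
    } else {
        println!("using the fallback conversion");
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```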