Commit

Fix conditional compilation, so that everything builds on machines without AVX512

martinothamar committed Jul 29, 2023
1 parent 1d1ab21 commit 20ddb7b
Showing 8 changed files with 132 additions and 70 deletions.
4 changes: 4 additions & 0 deletions Makefile
@@ -25,6 +25,10 @@ dasm:
cargo objdump --example dasm --release -- \
-d -M intel > $(bindir)/dasm.asm 2> $(bindir)/dasm.asm.log

dasmbench:
cargo objdump --bench main --release -- \
-d -M intel > target/release/bench.asm 2> target/release/bench.asm.log

asm:
cargo rustc --release --example dasm -- --emit asm -C "llvm-args=-x86-asm-syntax=intel"

14 changes: 13 additions & 1 deletion README.md
@@ -5,7 +5,9 @@ Categories:
- [`portable`] - portable implementations using `std::simd` (nightly required)
- [`specific`] - implementations using architecture-specific hardware intrinsics
- [`specific::avx2`] - AVX2 for x86_64 architecture (4 lanes for 64bit)
- Requires the `avx2` CPU flag, but has additional optimizations if you also have `avx512dq` and `avx512vl`
- [`specific::avx512`] - AVX512 for x86_64 architecture (8 lanes for 64bit)
- Requires the `avx512f` and `avx512dq` CPU flags

Vectorized PRNG implementations may perform anywhere from 4-6x faster in my experience,
though this is of course very dependent on the hardware used ("old" CPUs with AVX512, for example, may suffer from excessive thermal throttling).
@@ -40,11 +42,21 @@ fn main() {
}
```

The `portable` module is available on any architecture; even on x86_64 with only AVX2, for example, you can still use `Xoshiro256PlusPlusX8`, which uses
8-lane/512-bit vectors (`u64x8` from `std::simd`). The compiler is able to make it reasonably fast even if the generated code can only use 256-bit wide registers (AVX2).

The `specific` submodules (currently AVX2 and AVX512) are only compiled in when the target arch/features support them.

In general, use the `portable` module. The only risk/drawback to using the `portable` module is that, in principle,
the compiler isn't _forced_ to use the "optimal" instructions and registers for your hardware. In practice, it probably will, though.
In the `specific` submodules the respective hardware intrinsics are "hardcoded", so to speak, so we always know what the generated code looks like.
In some contexts that may be useful.
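
A minimal sketch of the `portable` path, assuming nightly Rust (`std::simd` is unstable) and using the type and trait names that appear in the diffs below:

```rust
#![feature(portable_simd)]

use std::simd::u64x8;

use rand_core::SeedableRng;
use simd_rand::portable::{SimdRandX8, Xoshiro256PlusX8};

fn main() {
    // Seed the 8-lane Xoshiro256+ generator.
    let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EED);

    // One call yields eight u64s at once as a portable SIMD vector.
    let v: u64x8 = rng.next_u64x8();
    println!("{:?}", v.to_array());
}
```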

## Performance

The top-performing generator (on my current hardware) is currently Xoshiro256+ using AVX512 intrinsics,
at about 5.9x faster than the scalar baseline. The benchmarks below generate `u64x8` numbers in a loop.
Note that the RandVectorized variant uses `simd_support` from the `rand` crate,
but this doesn't actually vectorize random number generation.

If you want to actually use these generators, you should benchmark them yourself on your own hardware. See the `bench` target in the [Makefile](/Makefile).
19 changes: 7 additions & 12 deletions benches/main.rs
@@ -13,7 +13,6 @@ use criterion_perf_events::Perf;
use perfcnt::linux::HardwareEventType as Hardware;
use perfcnt::linux::PerfCounterBuilderLinux as Builder;
use rand_core::SeedableRng;
use simd_rand::specific::avx2::*;

mod portable;
mod scratch;
@@ -34,17 +33,13 @@ fn bench<M: Measurement, const T: u8>(c: &mut Criterion<M>) {

crate::portable::add_benchmarks::<_, ITERATIONS>(c, suffix);

if cfg!(all(target_arch = "x86_64", target_feature = "avx2")) {
crate::specific::avx2::add_benchmarks::<_, ITERATIONS>(c, suffix);
}
if cfg!(all(
target_arch = "x86_64",
target_feature = "avx512f",
target_feature = "avx512dq"
)) {
crate::specific::avx512::add_benchmarks::<_, ITERATIONS>(c, suffix);
crate::top::add_top_benchmark::<_, ITERATIONS>(c);
}
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
crate::specific::avx2::add_benchmarks::<_, ITERATIONS>(c, suffix);

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
crate::specific::avx512::add_benchmarks::<_, ITERATIONS>(c, suffix);

crate::top::add_top_benchmark::<_, ITERATIONS>(c);
}

#[non_exhaustive]
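
This hunk is the crux of the fix: `cfg!(...)` expands to a compile-time boolean, but the body of the `if` still has to type-check on every target, so machines without AVX512 failed to build. The `#[cfg(...)]` attribute removes the annotated code from compilation entirely. A minimal sketch of the difference, with `avx512_only` as a hypothetical stand-in:

```rust
fn main() {
    // Broken on non-AVX512 targets: this branch is never taken,
    // but its body must still type-check, so the build fails if
    // `avx512_only` references items gated behind AVX512.
    //
    // if cfg!(target_feature = "avx512f") {
    //     avx512_only();
    // }

    // OK: the attribute strips the statement before type-checking
    // on targets without the feature.
    #[cfg(target_feature = "avx512f")]
    avx512_only();
}

#[cfg(target_feature = "avx512f")]
fn avx512_only() {
    println!("taking the AVX512 path");
}
```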
36 changes: 23 additions & 13 deletions benches/scratch/mod.rs
@@ -5,9 +5,7 @@ use criterion::{black_box, measurement::Measurement, BenchmarkId, Criterion, Thr
use rand_core::{SeedableRng, RngCore};
use rand_xoshiro::Xoshiro256Plus;
use simd_rand::portable;
use simd_rand::portable::SimdRandX8 as PortableSimdRand;
use simd_rand::specific;
use simd_rand::specific::avx512::SimdRand as SpecificSimdRand;
use simd_rand::portable::*;

#[inline(always)]
fn do_u64_baseline(rng: &mut Xoshiro256Plus) -> u64x8 {
@@ -24,15 +22,27 @@ fn do_u64_baseline(rng: &mut Xoshiro256Plus) -> u64x8 {
}

#[inline(always)]
fn do_u64_portable<RNG: PortableSimdRand>(rng: &mut RNG) -> u64x8 {
rng.next_u64x8()
fn do_u64_portable_x4<RNG: SimdRandX4>(rng: &mut RNG) -> u64x8 {
let a = rng.next_u64x4();
let b = rng.next_u64x4();
u64x8::from_array([
a[0],
a[1],
a[2],
a[3],
b[0],
b[1],
b[2],
b[3],
])
}

#[inline(always)]
fn do_u64_specific<RNG: SpecificSimdRand>(rng: &mut RNG) -> __m512i {
rng.next_m512i()
fn do_u64_portable_x8<RNG: SimdRandX8>(rng: &mut RNG) -> u64x8 {
rng.next_u64x8()
}


pub fn add_benchmarks<M: Measurement>(c: &mut Criterion<M>, suffix: &str) {
let mut group = c.benchmark_group("Scratch");

@@ -44,17 +54,17 @@ pub fn add_benchmarks<M: Measurement>(c: &mut Criterion<M>, suffix: &str) {

b.iter(|| do_u64_baseline(&mut rng))
});
let name = BenchmarkId::new(format!("Portable/Xoshiro256+/{suffix}"), "1");
let name = BenchmarkId::new(format!("Portable/Xoshiro256+X4/{suffix}"), "1");
group.bench_with_input(name, &1, |b, _| {
let mut rng = portable::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut rng = portable::Xoshiro256PlusX4::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);

b.iter(|| do_u64_portable(&mut rng))
b.iter(|| do_u64_portable_x4(&mut rng))
});
let name = BenchmarkId::new(format!("Specific/Xoshiro256+/{suffix}"), "1");
let name = BenchmarkId::new(format!("Portable/Xoshiro256+X8/{suffix}"), "1");
group.bench_with_input(name, &1, |b, _| {
let mut rng = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut rng = portable::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);

b.iter(|| do_u64_specific(&mut rng))
b.iter(|| do_u64_portable_x8(&mut rng))
});

group.finish();
94 changes: 51 additions & 43 deletions benches/top/mod.rs
@@ -1,67 +1,75 @@
use std::{mem, arch::x86_64::*};
use std::{mem, arch::x86_64::*, simd::u64x8};

use criterion::{measurement::Measurement, Criterion, Throughput, black_box, BenchmarkId};
use rand::Rng;
use rand_core::{RngCore, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;
use simd_rand::specific::avx512::{U64x8, SimdRand, Xoshiro256PlusX8};
use simd_rand::portable::*;
use simd_rand::specific;
use packed_simd_2::u64x8 as ps_u64x8;

#[inline(always)]
fn execute_rand<RNG: RngCore>(rng: &mut RNG, data: &mut U64x8, i: usize) {
for _ in 0..i {
for i in 0..8 {
black_box(&mut *data)[i] = rng.next_u64();
}
#[inline(never)]
fn execute_rand<RNG: RngCore>(rng: &mut RNG, data: &mut u64x8) {
for i in 0..8 {
data[i] = rng.next_u64();
}
}

#[inline(always)]
fn execute_rand_vectorized<RNG: RngCore>(rng: &mut RNG, data: &mut ps_u64x8, i: usize) {
for _ in 0..i {
*black_box(&mut *data) = rng.gen::<ps_u64x8>();
}
#[inline(never)]
fn execute_rand_vectorized<RNG: RngCore>(rng: &mut RNG, data: &mut ps_u64x8) {
*data = rng.gen::<ps_u64x8>();
}

#[inline(always)]
fn execute_vectorized<RNG: SimdRand>(rng: &mut RNG, data: &mut __m512i, i: usize) {
for _ in 0..i {
*black_box(&mut *data) = rng.next_m512i();
}
#[inline(never)]
fn execute_vectorized_portable<RNG: SimdRandX8>(rng: &mut RNG, data: &mut u64x8) {
*data = rng.next_u64x8();
}

pub fn add_top_benchmark<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>) {
let mut group = c.benchmark_group("top");
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
#[inline(never)]
fn execute_vectorized_specific<RNG: specific::avx512::SimdRand>(rng: &mut RNG, data: &mut __m512i) {
*data = rng.next_m512i();
}

let iterations: Vec<_> = (0..8).map(|v| (v + 1) * ITERATIONS).collect();
pub fn add_top_benchmark<M: Measurement, const ITERATIONS: usize>(c: &mut Criterion<M>) {
let mut group = c.benchmark_group("Top");

for iterations in iterations {
group.throughput(Throughput::Bytes((iterations * mem::size_of::<__m512i>()) as u64));
group.throughput(Throughput::Bytes(mem::size_of::<u64x8>() as u64));

let name = BenchmarkId::new(format!("Rand/Xoshiro256+"), iterations);
group.bench_with_input(name, &iterations, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();
let name = BenchmarkId::new(format!("Rand/Xoshiro256+"), 1);

b.iter(|| execute_rand(&mut rng, black_box(&mut data), black_box(*i)))
});
group.bench_with_input(name, &1, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();

let name = BenchmarkId::new(format!("RandVectorized/Xoshiro256+"), iterations);
group.bench_with_input(name, &iterations, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();
b.iter(|| execute_rand(&mut rng, black_box(&mut data)))
});

b.iter(|| execute_rand_vectorized(&mut rng, black_box(&mut data), black_box(*i)))
});

let name = BenchmarkId::new(format!("AVX512/Xoshiro256+"), iterations);
group.bench_with_input(name, &iterations, |b, i| unsafe {
let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data: __m512i = _mm512_setzero_si512();
let name = BenchmarkId::new(format!("RandVectorized/Xoshiro256+"), 1);
group.bench_with_input(name, &1, |b, i| {
let mut rng = Xoshiro256Plus::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();

b.iter(|| execute_vectorized(&mut rng, black_box(&mut data), black_box(*i)))
});
}
b.iter(|| execute_rand_vectorized(&mut rng, black_box(&mut data)))
});

let name = BenchmarkId::new(format!("Portable/Xoshiro256+X8"), 1);
group.bench_with_input(name, &1, |b, i| {
let mut rng = Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data = Default::default();

b.iter(|| execute_vectorized_portable(&mut rng, black_box(&mut data)))
});

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
let name = BenchmarkId::new(format!("Specific/Xoshiro256+X8"), iterations);
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
group.bench_with_input(name, &iterations, |b, i| unsafe {
let mut rng = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0x0DDB1A5E5BAD5EEDu64);
let mut data: __m512i = _mm512_setzero_si512();

b.iter(|| execute_vectorized_specific(&mut rng, black_box(&mut data)))
});

group.finish();
}
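
A side note on this hunk: the benchmark bodies moved from `#[inline(always)]` to `#[inline(never)]`, which keeps each generator's code a separate, inspectable unit in the disassembly (see the `dasmbench` Makefile target), while `black_box` stops the optimizer from deleting the work as dead code. The pattern in miniature, using `std::hint::black_box` as a stand-in for criterion's:

```rust
use std::hint::black_box;

#[inline(never)]
fn fill(data: &mut [u64; 8]) {
    // Stand-in for eight rng.next_u64() calls.
    for (i, slot) in data.iter_mut().enumerate() {
        *slot = i as u64;
    }
}

fn main() {
    let mut data = [0u64; 8];
    // black_box hides the pointer from the optimizer,
    // so the stores inside `fill` cannot be elided.
    fill(black_box(&mut data));
    black_box(&data);
}
```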
32 changes: 32 additions & 0 deletions examples/_internal/dasm.rs
@@ -7,18 +7,30 @@ use std::simd::u64x4;
use std::simd::u64x8;

use criterion::black_box;
use rand_core::RngCore;
use rand_core::SeedableRng;
use simd_rand::portable;
use simd_rand::portable::SimdRandX4 as PortableSimdRandX4;
use simd_rand::portable::SimdRandX8 as PortableSimdRandX8;
use simd_rand::specific;
use simd_rand::specific::avx2::SimdRand as SpecificSimdRandX4;
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
use simd_rand::specific::avx512::SimdRand as SpecificSimdRandX8;

/// This is a small binary meant to aid in analyzing generated code
/// For example to see differences between portable and specific code,
/// and simd_rand and rand code
#[inline(never)]
fn do_u64x4_baseline<RNG: RngCore>(rng: &mut RNG) -> u64x4 {
u64x4::from_array([
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
])
}

#[inline(never)]
fn do_u64x4_portable<RNG: PortableSimdRandX4>(rng: &mut RNG) -> u64x4 {
rng.next_u64x4()
@@ -29,11 +41,26 @@ fn do_u64x4_specific<RNG: SpecificSimdRandX4>(rng: &mut RNG) -> __m256i {
rng.next_m256i()
}

#[inline(never)]
fn do_u64x8_baseline<RNG: RngCore>(rng: &mut RNG) -> u64x8 {
u64x8::from_array([
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
rng.next_u64(),
])
}

#[inline(never)]
fn do_u64x8_portable<RNG: PortableSimdRandX8>(rng: &mut RNG) -> u64x8 {
rng.next_u64x8()
}

#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
#[inline(never)]
fn do_u64x8_specific<RNG: SpecificSimdRandX8>(rng: &mut RNG) -> __m512i {
rng.next_m512i()
@@ -50,14 +77,19 @@ fn do_f64x4_portable<RNG: PortableSimdRandX4>(rng: &mut RNG) -> f64x4 {
}

fn main() {
let mut rng_base = rand_xoshiro::Xoshiro256Plus::seed_from_u64(0);
let mut rng1 = portable::Xoshiro256PlusX4::seed_from_u64(0);
let mut rng2 = specific::avx2::Xoshiro256PlusX4::seed_from_u64(0);
let mut rng3 = portable::Xoshiro256PlusX8::seed_from_u64(0);
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
let mut rng4 = specific::avx512::Xoshiro256PlusX8::seed_from_u64(0);

black_box(do_u64x4_baseline(&mut rng_base));
black_box(do_u64x4_portable(&mut rng1));
black_box(do_u64x4_specific(&mut rng2));
black_box(do_u64x8_baseline(&mut rng_base));
black_box(do_u64x8_portable(&mut rng3));
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"))]
black_box(do_u64x8_specific(&mut rng4));
black_box(do_f64x4_specific(&mut rng2));
black_box(do_f64x4_portable(&mut rng1));
1 change: 1 addition & 0 deletions src/lib.rs
@@ -67,6 +67,7 @@
//!
//! There is also some inline assembly used, where the C-style intrinsics haven't been exposed as Rust APIs in `std::arch`.
// The stdsimd feature is required for some of the AVX512 intrinsics
#![cfg_attr(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512dq"), feature(stdsimd))]
#![feature(portable_simd)]

2 changes: 1 addition & 1 deletion src/specific/avx2/simdrand.rs
@@ -49,7 +49,7 @@ pub trait SimdRand {

#[inline(always)]
unsafe fn m256i_to_m256d(v: __m256i) -> __m256d {
if is_x86_feature_detected!("avx512dq") || is_x86_feature_detected!("avx512vl") {
if is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl") {
// With AVX512 DQ/VL we can use the below instruction
// with both 512bit and 256bit vectors
// see https://www.felixcloutier.com/x86/vcvtuqq2pd
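
The `||` → `&&` change fixes a logic bug in the runtime feature detection: the 256-bit form of `vcvtuqq2pd` requires both `avx512dq` and `avx512vl`, so the old check could take the AVX512 path on a CPU with only one of the two flags. The corrected pattern in isolation (x86_64 only):

```rust
fn main() {
    // Both flags are required for vcvtuqq2pd on 256-bit vectors,
    // so the detections must be combined with `&&`, not `||`.
    let fast_path = is_x86_feature_detected!("avx512dq")
        && is_x86_feature_detected!("avx512vl");
    println!("can use vcvtuqq2pd on 256-bit vectors: {fast_path}");
}
```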
