Merge pull request #1 from MichaelHirn/bench
feat/bench: add bench and perf utilities
ehiggs committed Jan 2, 2016
2 parents ece54e3 + 0e2d34c commit d0cd78b
Showing 7 changed files with 296 additions and 0 deletions.
4 changes: 4 additions & 0 deletions Cargo.toml
@@ -21,6 +21,10 @@ lazy_static = "0.1"

clippy = { version = "0.0.27", optional = true }

[dev-dependencies]

rand = "0.3"

[features]
default = ["native", "cuda", "opencl"]
native = ["collenchyma/native"]
88 changes: 88 additions & 0 deletions benches/relu.rs
@@ -0,0 +1,88 @@
#![feature(test)]
#![feature(clone_from_slice)]

extern crate test;
extern crate collenchyma as co;
extern crate collenchyma_nn as co_nn;
extern crate rand;

use test::Bencher;
use co::backend::{Backend, BackendConfig};
use co::frameworks::Native;
use co::framework::IFramework;
use co::tensor::SharedTensor;
use co_nn::*;

use rand::{thread_rng, Rng};

/// Construct a native (CPU) backend for the benchmarks.
fn backend() -> Backend<Native> {
    let framework = Native::new();
    let hardwares = framework.hardwares();
    let backend_config = BackendConfig::new(framework, hardwares);
    Backend::new(backend_config).unwrap()
}

/// Allocate an input and an output tensor of `size` elements; the input is seeded with random data.
fn arguments<T: IFramework + Clone>(backend: &Backend<T>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>) {
    let mut rng = thread_rng();
    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();

    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    (x, out)
}

/// Allocate the four tensors used by the gradient benchmarks; `x`, `dx`, and `out` are seeded with the same random data.
fn arguments_grad<T: IFramework + Clone>(backend: &Backend<T>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>) {
    let mut rng = thread_rng();
    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();

    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let mut dx = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let mut out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let dout = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    dx.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    out.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    (x, dx, out, dout)
}

/// Run `bench_func` `times` times inside each measured iteration so short operations dominate the timing.
#[inline(never)]
fn bench_profile<F: FnMut() -> ()>(
    b: &mut Bencher,
    mut bench_func: F,
    times: usize
) {
    b.iter(|| { for _ in 0..times { bench_func(); } });
}

#[bench]
fn bench_1000_relu_100_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut out) = arguments(&backend, 100);
    let mut func = || { let _ = backend.relu_plain(&mut x, &mut out); };
    { func(); bench_profile(b, func, 1000); }
}

#[bench]
fn bench_10_relu_10000_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut out) = arguments(&backend, 10000);
    let mut func = || { let _ = backend.relu_plain(&mut x, &mut out); };
    { func(); bench_profile(b, func, 10); }
}

#[bench]
fn bench_1000_relu_grad_100_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut dx, mut out, mut dout) = arguments_grad(&backend, 100);
    let mut func = || { let _ = backend.relu_grad_plain(&mut x, &mut dx, &mut out, &mut dout); };
    { func(); bench_profile(b, func, 1000); }
}

#[bench]
fn bench_10_relu_grad_10000_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut dx, mut out, mut dout) = arguments_grad(&backend, 10000);
    let mut func = || { let _ = backend.relu_grad_plain(&mut x, &mut dx, &mut out, &mut dout); };
    { func(); bench_profile(b, func, 10); }
}
88 changes: 88 additions & 0 deletions benches/sigmoid.rs
@@ -0,0 +1,88 @@
#![feature(test)]
#![feature(clone_from_slice)]

extern crate test;
extern crate collenchyma as co;
extern crate collenchyma_nn as co_nn;
extern crate rand;

use test::Bencher;
use co::backend::{Backend, BackendConfig};
use co::frameworks::Native;
use co::framework::IFramework;
use co::tensor::SharedTensor;
use co_nn::*;

use rand::{thread_rng, Rng};

/// Construct a native (CPU) backend for the benchmarks.
fn backend() -> Backend<Native> {
    let framework = Native::new();
    let hardwares = framework.hardwares();
    let backend_config = BackendConfig::new(framework, hardwares);
    Backend::new(backend_config).unwrap()
}

/// Allocate an input and an output tensor of `size` elements; the input is seeded with random data.
fn arguments<T: IFramework + Clone>(backend: &Backend<T>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>) {
    let mut rng = thread_rng();
    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();

    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    (x, out)
}

/// Allocate the four tensors used by the gradient benchmarks; `x`, `dx`, and `out` are seeded with the same random data.
fn arguments_grad<T: IFramework + Clone>(backend: &Backend<T>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>) {
    let mut rng = thread_rng();
    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();

    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let mut dx = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let mut out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let dout = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    dx.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    out.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    (x, dx, out, dout)
}

/// Run `bench_func` `times` times inside each measured iteration so short operations dominate the timing.
#[inline(never)]
fn bench_profile<F: FnMut() -> ()>(
    b: &mut Bencher,
    mut bench_func: F,
    times: usize
) {
    b.iter(|| { for _ in 0..times { bench_func(); } });
}

#[bench]
fn bench_1000_sigmoid_100_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut out) = arguments(&backend, 100);
    let mut func = || { let _ = backend.sigmoid_plain(&mut x, &mut out); };
    { func(); bench_profile(b, func, 1000); }
}

#[bench]
fn bench_10_sigmoid_10000_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut out) = arguments(&backend, 10000);
    let mut func = || { let _ = backend.sigmoid_plain(&mut x, &mut out); };
    { func(); bench_profile(b, func, 10); }
}

#[bench]
fn bench_1000_sigmoid_grad_100_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut dx, mut out, mut dout) = arguments_grad(&backend, 100);
    let mut func = || { let _ = backend.sigmoid_grad_plain(&mut x, &mut dx, &mut out, &mut dout); };
    { func(); bench_profile(b, func, 1000); }
}

#[bench]
fn bench_10_sigmoid_grad_10000_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut dx, mut out, mut dout) = arguments_grad(&backend, 10000);
    let mut func = || { let _ = backend.sigmoid_grad_plain(&mut x, &mut dx, &mut out, &mut dout); };
    { func(); bench_profile(b, func, 10); }
}
88 changes: 88 additions & 0 deletions benches/tanh.rs
@@ -0,0 +1,88 @@
#![feature(test)]
#![feature(clone_from_slice)]

extern crate test;
extern crate collenchyma as co;
extern crate collenchyma_nn as co_nn;
extern crate rand;

use test::Bencher;
use co::backend::{Backend, BackendConfig};
use co::frameworks::Native;
use co::framework::IFramework;
use co::tensor::SharedTensor;
use co_nn::*;

use rand::{thread_rng, Rng};

/// Construct a native (CPU) backend for the benchmarks.
fn backend() -> Backend<Native> {
    let framework = Native::new();
    let hardwares = framework.hardwares();
    let backend_config = BackendConfig::new(framework, hardwares);
    Backend::new(backend_config).unwrap()
}

/// Allocate an input and an output tensor of `size` elements; the input is seeded with random data.
fn arguments<T: IFramework + Clone>(backend: &Backend<T>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>) {
    let mut rng = thread_rng();
    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();

    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    (x, out)
}

/// Allocate the four tensors used by the gradient benchmarks; `x`, `dx`, and `out` are seeded with the same random data.
fn arguments_grad<T: IFramework + Clone>(backend: &Backend<T>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>) {
    let mut rng = thread_rng();
    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();

    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let mut dx = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let mut out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    let dout = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    dx.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    out.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
    (x, dx, out, dout)
}

/// Run `bench_func` `times` times inside each measured iteration so short operations dominate the timing.
#[inline(never)]
fn bench_profile<F: FnMut() -> ()>(
    b: &mut Bencher,
    mut bench_func: F,
    times: usize
) {
    b.iter(|| { for _ in 0..times { bench_func(); } });
}

#[bench]
fn bench_1000_tanh_100_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut out) = arguments(&backend, 100);
    let mut func = || { let _ = backend.tanh_plain(&mut x, &mut out); };
    { func(); bench_profile(b, func, 1000); }
}

#[bench]
fn bench_10_tanh_10000_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut out) = arguments(&backend, 10000);
    let mut func = || { let _ = backend.tanh_plain(&mut x, &mut out); };
    { func(); bench_profile(b, func, 10); }
}

#[bench]
fn bench_1000_tanh_grad_100_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut dx, mut out, mut dout) = arguments_grad(&backend, 100);
    let mut func = || { let _ = backend.tanh_grad_plain(&mut x, &mut dx, &mut out, &mut dout); };
    { func(); bench_profile(b, func, 1000); }
}

#[bench]
fn bench_10_tanh_grad_10000_native(b: &mut Bencher) {
    let backend = backend();
    let (mut x, mut dx, mut out, mut dout) = arguments_grad(&backend, 10000);
    let mut func = || { let _ = backend.tanh_grad_plain(&mut x, &mut dx, &mut out, &mut dout); };
    { func(); bench_profile(b, func, 10); }
}
11 changes: 11 additions & 0 deletions perf/README.md
@@ -0,0 +1,11 @@
# Profiling

Collenchyma comes with scripts to help with profiling performance problems.

Run [perf](http://www.brendangregg.com/perf.html) on one of the benchmark tests:

```sh
# compile latest version of benchmarks with DWARF information
cargo rustc --bench [bench_file_name] -- -g
sudo ./perf/run_perf.sh [bench_fn_name] # perf needs sudo
```
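
For example, with the relu benchmark added in this commit, a run could look like the sketch below. Note that `run_perf.sh` currently hardcodes a specific benchmark binary under `target/debug/`, so the binary path inside the script may need to be adjusted to match the bench binary that `cargo rustc` just produced.

```sh
# build the relu bench with DWARF debug information
cargo rustc --bench relu -- -g
# profile one of its benchmark functions; output is written to target/perf/
sudo ./perf/run_perf.sh bench_1000_relu_100_native
```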
5 changes: 5 additions & 0 deletions perf/perf_rblas.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# Profile the rblas dot-product benchmark with perf and render a flame graph from the recorded stacks.
perf record -a -g --output perf_rblas_data.perf target/debug/rblas_overhead-cf1a2670c118749d --bench bench_1000_dot_100_rblas
perf script -f -i perf_rblas_data.perf > perf_rblas_script.perf
/home/hobofan/stuff/FlameGraph/stackcollapse-perf.pl perf_rblas_script.perf > perf_rblas_folded.perf
/home/hobofan/stuff/FlameGraph/flamegraph.pl perf_rblas_folded.perf > perf_rblas_graph.svg
12 changes: 12 additions & 0 deletions perf/run_perf.sh
@@ -0,0 +1,12 @@
#!/bin/bash
if [ $# -eq 0 ]
then
    echo "No benchmark name supplied"
    exit 1
fi
benchname=$1
mkdir -p target/perf
# Record a profile of the requested benchmark function, then fold the stacks and render a flame graph.
perf record -a -g --output target/perf/${benchname}.data target/debug/rblas_overhead-c02a41a1401d43da --bench ${benchname}
perf script -f -i target/perf/${benchname}.data > target/perf/${benchname}.scripted
stackcollapse-perf target/perf/${benchname}.scripted | grep ${benchname} > target/perf/${benchname}.folded
flamegraph target/perf/${benchname}.folded > target/perf/${benchname}.svg
