Skip to content

Commit

Permalink
Implementation of vdev_raidz parity generate and reconstruct routines
Browse files Browse the repository at this point in the history
- specialized gen/rec routines for all RAIDZ levels,
- new scalar raidz implementation (unrolled),
- two x86_64 SIMD implementations (SSE and AVX2 instruction sets),
- fastest routines selected on module load (benchmark).
- cmd/raidz_test - verify and benchmark all implementations against original

New zfs module parameters:
- zfs_raidz_math_impl (int): selects a new implementation to use:
    "-1" - the fastest (DEFAULT),
     "0" - new scalar routines,
     "1" - new SSE routines,
     "2" - new AVX2 routines.

- zfs_raidz_new_math (uint): enables or disables new implementations:
    "1" - use new raidz implementation (DEFAULT),
    "0" - use old raidz implementation.

vdev_raidz_math: fix AVX2 code compilation against kernels older and newer than ~3.16
kernel comp: fix CPU feature check for new kernels
vdev_raidz_math_scalar: fix function prototype inconsistencies
autoconf: added m4 macros for userspace CPUID and SIMD toolchain support
  • Loading branch information
ironMann committed Mar 10, 2016
1 parent eea9309 commit db01fae
Show file tree
Hide file tree
Showing 29 changed files with 6,719 additions and 132 deletions.
2 changes: 1 addition & 1 deletion cmd/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios
SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed
SUBDIRS += arc_summary
SUBDIRS += arc_summary raidz_test
1 change: 1 addition & 0 deletions cmd/raidz_test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/raidz_test
20 changes: 20 additions & 0 deletions cmd/raidz_test/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
include $(top_srcdir)/config/Rules.am

AM_CFLAGS += $(DEBUG_STACKFLAGS) $(FRAME_LARGER_THAN)
AM_CPPFLAGS += -DDEBUG

DEFAULT_INCLUDES += \
-I$(top_srcdir)/include \
-I$(top_srcdir)/lib/libspl/include

bin_PROGRAMS = raidz_test

raidz_test_SOURCES = \
raidz_test.h \
raidz_test.c \
raidz_bench.c

raidz_test_LDADD = \
$(top_builddir)/lib/libzpool/libzpool.la

raidz_test_LDADD += -lm -ldl
288 changes: 288 additions & 0 deletions cmd/raidz_test/raidz_bench.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/

#include <sys/zfs_context.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/zio.h>
#include <sys/vdev_raidz_impl.h>
#include <stdio.h>

#include <sys/time.h>
#include <sys/resource.h>

#include "raidz_test.h"

/* Divided by the zio size to choose the iteration count of a generate run */
#define GEN_BENCH_MEMORY (1ULL<<32)
/* Divided by the zio size to choose the iteration count of a reconstruct run */
#define REC_BENCH_MEMORY (1ULL<<30)

/* zio reused by every benchmark; io_data is set in bench_init_raidz_map() */
static zio_t zio_bench;
/* raidz map currently being benchmarked */
static raidz_map_t *rm_bench;
/* size in bytes of zio_bench.io_data, kept for the matching umem_free() */
static size_t max_data_size;

/*
 * Allocate and fill the zio used by the benchmarks in this file.
 *
 * The data buffer holds 2^24 bytes for each of the rto_opts.rto_dcols
 * columns; its size is remembered in max_data_size so that it can be
 * released later.
 */
static void
bench_init_raidz_map(void)
{
	const unsigned long long col_bytes = 1ULL << 24;

	max_data_size = col_bytes * rto_opts.rto_dcols;

	if (rto_opts.rto_verbose) {
		PRINT(DBLSEP);
		PRINT("Initializing benchmark data ...\n\n");
	}

	/*
	 * To permit larger column sizes the buffer has to be allocated
	 * with the aligned allocator instead of zio_data_buf_alloc.
	 */
	zio_bench.io_data = umem_alloc_aligned(max_data_size, 512, UMEM_NOFAIL);
	zio_bench.io_size = max_data_size;
	zio_bench.io_offset = 0;

	init_zio_data(&zio_bench);
}

/*
 * Release the benchmark data buffer (max_data_size bytes) and wipe the
 * benchmark zio so that no stale pointer to the freed buffer remains.
 */
static void
bench_fini_raidz_maps(void)
{
	/* the buffer must be freed before the zio holding its address is zeroed */
	umem_free(zio_bench.io_data, max_data_size);
	bzero(&zio_bench, sizeof (zio_bench));
}

/*
 * Return the difference, in seconds, between the user CPU time (ru_utime)
 * recorded in two rusage samples: stop minus start. No other rusage field
 * is consulted.
 */
static double
get_time_diff(struct rusage *start, struct rusage *stop)
{
	const double usec_per_sec = 1e6;
	double begin_us, end_us;

	/* convert each sample to microseconds before subtracting */
	begin_us = (double)start->ru_utime.tv_sec * usec_per_sec +
	    (double)start->ru_utime.tv_usec;
	end_us = (double)stop->ru_utime.tv_sec * usec_per_sec +
	    (double)stop->ru_utime.tv_usec;

	return ((end_us - begin_us) / usec_per_sec);
}

/*
 * Benchmark parity generation.
 *
 * For each of the RAIDZ_GEN_NUM generate routines and each column size
 * from 2^9 up to 2^MAX_DCSIZE bytes, time the original implementation
 * (vdev_raidz_generate_parity) and then every implementation listed in
 * raidz_supp_maths[]. One CSV row is printed per measurement:
 *
 *	impl, math, dcols, dsize, disk_bw, total_bw, iter
 *
 * disk_bw is (iterations * column size) / elapsed in MiB/s; total_bw is
 * disk_bw multiplied by (rto_dcols + fn + 1). Elapsed time is the calling
 * thread's user CPU time as sampled with getrusage(RUSAGE_THREAD).
 *
 * Side effect: zfs_raidz_new_math is set to 0 and is left that way on
 * return.
 */
static void
run_gen_bench(void)
{
	raidz_math_ops_t *c_ops;
	struct rusage start, stop;
	size_t impl, fn, ds, iter_cnt, iter;
	double elapsed, d_bw;

	if (rto_opts.rto_verbose) {
		PRINT(DBLSEP);
		PRINT("Benchmarking parity generation...\n\n");
	}

	PRINT("impl, math, dcols, dsize, disk_bw, total_bw, iter\n");

	/* benchmark original impl */
	zfs_raidz_new_math = 0; /* disable new RAIDZ impl */

	/* Benchmark generate functions */
	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

		for (ds = 9; ds <= MAX_DCSIZE; ds++) {

			/*
			 * create suitable raidz_map
			 * NOTE(review): the last two arguments look like
			 * the total column count and the parity count -
			 * confirm against vdev_raidz_map_alloc().
			 */
			zio_bench.io_size = (1ULL<<ds) * rto_opts.rto_dcols;
			rm_bench = vdev_raidz_map_alloc(&zio_bench, 9,
			    rto_opts.rto_dcols + fn+1, fn+1);

			/* guess iteration count */
			iter_cnt = (GEN_BENCH_MEMORY) / zio_bench.io_size;

			getrusage(RUSAGE_THREAD, &start);
			for (iter = 0; iter < iter_cnt; iter++) {
				vdev_raidz_generate_parity(rm_bench);
			}
			getrusage(RUSAGE_THREAD, &stop);

			elapsed = get_time_diff(&start, &stop);
			d_bw = (double)iter_cnt * (double)(1ULL<<ds);
			d_bw /= (1024. * 1024. * elapsed);

			PRINT("%10s, %8s, %zu, %10llu, %lf, %lf, %zu\n",
			    "original",
			    raidz_gen_name[fn],
			    rto_opts.rto_dcols,
			    (1ULL<<ds),
			    d_bw,
			    d_bw * (double)(rto_opts.rto_dcols+fn+1),
			    iter_cnt);

			vdev_raidz_map_free(rm_bench);
		}
	}

	/* benchmark new impl */
	for (impl = 0; impl < raidz_supp_maths_cnt; impl++) {
		c_ops = raidz_supp_maths[impl];

		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

			for (ds = 9; ds <= MAX_DCSIZE; ds++) {

				/* create suitable raidz_map */
				zio_bench.io_size = (1ULL<<ds);
				zio_bench.io_size *= rto_opts.rto_dcols;
				rm_bench = vdev_raidz_map_alloc(&zio_bench, 9,
				    rto_opts.rto_dcols + fn+1, fn+1);

				/* guess iteration count */
				iter_cnt = (GEN_BENCH_MEMORY) /
				    zio_bench.io_size;

				getrusage(RUSAGE_THREAD, &start);
				for (iter = 0; iter < iter_cnt; iter++) {
					c_ops->gen[fn](rm_bench);
				}
				getrusage(RUSAGE_THREAD, &stop);

				elapsed = get_time_diff(&start, &stop);
				d_bw = (double)iter_cnt * (double)(1ULL<<ds);
				d_bw /= (1024. * 1024. * elapsed);

				PRINT("%10s, %8s, %zu, %10llu, %lf, %lf, %zu\n",
				    c_ops->name,
				    raidz_gen_name[fn],
				    rto_opts.rto_dcols,
				    (1ULL<<ds),
				    d_bw,
				    d_bw * (double)(rto_opts.rto_dcols+fn+1),
				    iter_cnt);

				/*
				 * Release the map before the next size is
				 * benchmarked; without this every map
				 * allocated in this loop would be leaked.
				 */
				vdev_raidz_map_free(rm_bench);
			}
		}
	}
}

/*
 * Benchmark data reconstruction.
 *
 * For each of the RAIDZ_REC_NUM reconstruct routines and each column size
 * from 2^9 up to 2^MAX_DCSIZE bytes, a map with CODE_PQR parity is built
 * and reconstruction is timed, first through vdev_raidz_reconstruct() and
 * then through the rec[] routine of every implementation listed in
 * raidz_supp_maths[]. One CSV row is printed per measurement, in the same
 * column layout as the generate benchmark. Elapsed time is the calling
 * thread's user CPU time as sampled with getrusage(RUSAGE_THREAD).
 */
static void
run_rec_bench(void)
{
	/* target list handed to each new-implementation rec routine */
	int new_targets[3] = { 3, 4, 5};
	/* per-routine target lists handed to vdev_raidz_reconstruct() */
	int orig_targets[7][3] = {
		{1, 2, 3},
		{0, 2, 3},
		{0, 1, 3},
		{2, 3, 4},
		{1, 3, 4},
		{0, 3, 4},
		{3, 4, 5}
	};
	/*
	 * Added to rto_dcols to scale total_bw.
	 * NOTE(review): presumably the number of disks each routine
	 * rebuilds - confirm.
	 */
	size_t rec_disks[7] = {1, 1, 1, 2, 2, 2, 3};
	raidz_math_ops_t *ops;
	struct rusage t_begin, t_end;
	size_t m, rec, shift, reps, i;
	double secs, mib_per_sec;

	if (rto_opts.rto_verbose) {
		PRINT(DBLSEP);
		PRINT("Benchmarking data reconstruction...\n\n");
	}

	/* original implementation */
	for (rec = 0; rec < RAIDZ_REC_NUM; rec++) {
		for (shift = 9; shift <= MAX_DCSIZE; shift++) {
			const unsigned long long col_bytes = 1ULL << shift;

			/* build a map sized for this column size */
			zio_bench.io_size = col_bytes * rto_opts.rto_dcols;
			rm_bench = vdev_raidz_map_alloc(&zio_bench, 9,
			    rto_opts.rto_dcols + CODE_PQR, CODE_PQR);

			/* estimate how many repetitions to time */
			reps = (REC_BENCH_MEMORY) / zio_bench.io_size;

			getrusage(RUSAGE_THREAD, &t_begin);
			for (i = 0; i < reps; i++)
				vdev_raidz_reconstruct(rm_bench,
				    orig_targets[rec], 3);
			getrusage(RUSAGE_THREAD, &t_end);

			secs = get_time_diff(&t_begin, &t_end);
			mib_per_sec = ((double)reps * (double)col_bytes) /
			    (1024. * 1024. * secs);

			PRINT("%10s, %8s, %zu, %10llu, %lf, %lf, %zu\n",
			    "original",
			    raidz_rec_name[rec],
			    rto_opts.rto_dcols,
			    col_bytes,
			    mib_per_sec,
			    mib_per_sec * (rto_opts.rto_dcols+rec_disks[rec]),
			    reps);

			vdev_raidz_map_free(rm_bench);
		}
	}

	/* every new implementation */
	for (m = 0; m < raidz_supp_maths_cnt; m++) {
		ops = raidz_supp_maths[m];

		for (rec = 0; rec < RAIDZ_REC_NUM; rec++) {
			for (shift = 9; shift <= MAX_DCSIZE; shift++) {
				const unsigned long long col_bytes =
				    1ULL << shift;

				/* build a map sized for this column size */
				zio_bench.io_size = col_bytes *
				    rto_opts.rto_dcols;
				rm_bench = vdev_raidz_map_alloc(&zio_bench, 9,
				    rto_opts.rto_dcols + CODE_PQR,
				    CODE_PQR);

				/* estimate how many repetitions to time */
				reps = (REC_BENCH_MEMORY) /
				    zio_bench.io_size;

				getrusage(RUSAGE_THREAD, &t_begin);
				for (i = 0; i < reps; i++)
					ops->rec[rec](rm_bench, new_targets);
				getrusage(RUSAGE_THREAD, &t_end);

				secs = get_time_diff(&t_begin, &t_end);
				mib_per_sec =
				    ((double)reps * (double)col_bytes) /
				    (1024. * 1024. * secs);

				PRINT("%10s, %8s, %zu, %10llu, %lf, %lf, %zu\n",
				    ops->name,
				    raidz_rec_name[rec],
				    rto_opts.rto_dcols,
				    col_bytes,
				    mib_per_sec,
				    mib_per_sec * (rto_opts.rto_dcols +
				    rec_disks[rec]),
				    reps);

				vdev_raidz_map_free(rm_bench);
			}
		}
	}
}

/*
 * Entry point of the benchmark: set up the benchmark zio, run the parity
 * generation benchmark followed by the reconstruction benchmark, then
 * release the benchmark data.
 */
void
run_raidz_benchmark(void)
{
	/* setup */
	bench_init_raidz_map();

	/* measurements; generation runs before reconstruction */
	run_gen_bench();
	run_rec_bench();

	/* teardown */
	bench_fini_raidz_maps();
}
Loading

0 comments on commit db01fae

Please sign in to comment.