Skip to content

Commit

Permalink
Implementation of vdev_raidz generate and reconstruct routines
Browse files Browse the repository at this point in the history
- specialized gen/rec routines for all RAIDZ levels,
- new scalar raidz implementation (unrolled),
- two x86_64 SIMD implementations (SSE4.1 and AVX2 instruction sets),
- fastest routines selected on module load (benchmark).

New zfs module parameters:
- zfs_raidz_math_impl (int): selects a new implementation to use:
    "-1" - the fastest (DEFAULT),
     "0" - new scalar routines,
     "1" - new SSE routines,
     "2" - new AVX2 routines.

- zfs_raidz_new_math (ulong): enables or disables new implementations:
    "1" - use new raidz implementation (DEFAULT),
    "0" - use old raidz implementation.

vdev_raidz_math: fix AVX2 code compilation against kernels older and newer than ~3.16
kernel comp: fix CPU feature check for new kernels
vdev_raidz_math_scalar: fix function prototype inconsistencies
  • Loading branch information
ironMann committed Feb 15, 2016
1 parent eea9309 commit 7e8471f
Show file tree
Hide file tree
Showing 17 changed files with 29,180 additions and 113 deletions.
1 change: 1 addition & 0 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/metaslab_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
Expand Down
1 change: 1 addition & 0 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
Expand Down
1 change: 1 addition & 0 deletions include/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/vdev_file.h \
$(top_srcdir)/include/sys/vdev.h \
$(top_srcdir)/include/sys/vdev_impl.h \
$(top_srcdir)/include/sys/vdev_raidz.h \
$(top_srcdir)/include/sys/xvattr.h \
$(top_srcdir)/include/sys/zap.h \
$(top_srcdir)/include/sys/zap_impl.h \
Expand Down
43 changes: 43 additions & 0 deletions include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (C) 2016 Gvozden Neskovic <[email protected]>.
*/

#ifndef _SYS_VDEV_RAIDZ_H
#define _SYS_VDEV_RAIDZ_H

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Initialize the raidz math layer.
 * NOTE(review): presumably this is where the available implementations are
 * benchmarked and the fastest one is selected -- confirm against the
 * definition in vdev_raidz_math.c.
 */
void vdev_raidz_math_init(void);

/*
 * Testing interface.
 *
 * Only declared for non-kernel (userspace) builds. The guard must test
 * _KERNEL, the macro used elsewhere in the tree; a misspelled macro name is
 * never defined, which would expose this prototype to kernel builds as well.
 * NOTE(review): the meaning of 'v' is not visible from this header; it
 * presumably picks/cycles the active implementation -- confirm.
 */
#if !defined(_KERNEL)
void vdev_raidz_cycle_impl(unsigned int v);
#endif

#ifdef __cplusplus
}
#endif

#endif /* _SYS_VDEV_RAIDZ_H */
4 changes: 4 additions & 0 deletions lib/libzpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ KERNEL_C = \
vdev_missing.c \
vdev_queue.c \
vdev_raidz.c \
vdev_raidz_math.c \
vdev_raidz_math_scalar.c \
vdev_raidz_math_sse.c \
vdev_raidz_math_avx2.c \
vdev_root.c \
zap.c \
zap_leaf.c \
Expand Down
5 changes: 5 additions & 0 deletions module/zfs/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ $(MODULE)-objs += vdev_mirror.o
$(MODULE)-objs += vdev_missing.o
$(MODULE)-objs += vdev_queue.o
$(MODULE)-objs += vdev_raidz.o
$(MODULE)-objs += vdev_raidz_math.o
$(MODULE)-objs += vdev_raidz_math_scalar.o
$(MODULE)-objs += vdev_root.o
$(MODULE)-objs += zap.o
$(MODULE)-objs += zap_leaf.o
Expand Down Expand Up @@ -107,3 +109,6 @@ $(MODULE)-objs += zrlock.o
$(MODULE)-objs += zvol.o
$(MODULE)-objs += dsl_destroy.o
$(MODULE)-objs += dsl_userhold.o

$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse.o
$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o
144 changes: 31 additions & 113 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand All @@ -32,6 +33,8 @@
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#include "vdev_raidz.h"

/*
* Virtual device vector for RAID-Z.
*
Expand Down Expand Up @@ -99,33 +102,6 @@
* or in concert to recover missing data columns.
*/

/*
 * Per-column state of a RAID-Z map. One of these exists for every entry of
 * the rm_col[] array in raidz_map_t; each describes the I/O issued to a
 * single child device.
 */
typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
void *rc_data; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Description of one RAID-Z I/O: bookkeeping counters followed by the
 * per-column array.
 *
 * rm_col is declared with a single element but is used as a variable-length
 * trailing array: the structure is sized with
 * offsetof(raidz_map_t, rm_col[rm_scols]) (see vdev_raidz_map_free()), so
 * sizeof (raidz_map_t) must not be used to allocate or free a map.
 */
typedef struct raidz_map {
uint64_t rm_cols; /* Regular column count */
uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
uint64_t rm_skipstart; /* Column index of padding start */
void *rm_datacopy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
} raidz_map_t;

#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
Expand Down Expand Up @@ -159,97 +135,14 @@ typedef struct raidz_map {
*/
int vdev_raidz_default_to_general;

/*
 * Powers of 2 in the Galois field defined above:
 * vdev_raidz_pow2[i] is 2 raised to the power i.
 * The last entry (index 255) holds 0x01 again, the same value as index 0.
 */
static const uint8_t vdev_raidz_pow2[256] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Base-2 logarithms in the Galois field defined above:
 * vdev_raidz_log2[a] is the exponent e for which 2 raised to e equals a
 * (e.g. entry 1 is 0x00 and entry 2 is 0x01).
 * Entry 0 is only a placeholder (0x00), since zero has no logarithm;
 * vdev_raidz_exp2() handles a == 0 before indexing this table.
 */
static const uint8_t vdev_raidz_log2[256] = {
0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
* Multiply a given number by 2 raised to the given power.
* Select a new raidz implementation by default.
*/
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
if (a == 0)
return (0);
unsigned long zfs_raidz_new_math = 1;

ASSERT(exp >= 0);
ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

exp += vdev_raidz_log2[a];
if (exp > 255)
exp -= 255;
static void vdev_raidz_generate_parity(raidz_map_t *rm);

return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
Expand All @@ -272,6 +165,9 @@ vdev_raidz_map_free(raidz_map_t *rm)
if (rm->rm_datacopy != NULL)
zio_buf_free(rm->rm_datacopy, size);

/* Free the raidz math plan */
raidz_math_plan_free(rm->rm_mathplan);

kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

Expand Down Expand Up @@ -579,6 +475,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,

zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;

/* Math plan */
rm->rm_mathplan = raidz_math_plan_alloc(rm);

return (rm);
}

Expand Down Expand Up @@ -729,6 +629,11 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
if(zfs_raidz_new_math != 0) {
raidz_math_generate(rm->rm_mathplan);
return;
}

switch (rm->rm_firstdatacol) {
case 1:
vdev_raidz_generate_parity_p(rm);
Expand Down Expand Up @@ -1435,6 +1340,14 @@ vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)

dt = &tgts[nbadparity];

/*
* Reconstruct using the new math implementation.
*/
if(zfs_raidz_new_math != 0) {
return (raidz_math_reconstruct(rm->rm_mathplan,
parity_valid, dt, nbaddata));
}

/*
* See if we can use any of our optimized reconstruction routines.
*/
Expand Down Expand Up @@ -2220,3 +2133,8 @@ vdev_ops_t vdev_raidz_ops = {
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};

#if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * Register zfs_raidz_new_math as a module parameter (mode 0644): zero selects
 * the old raidz code paths, any non-zero value selects the new math routines.
 */
module_param(zfs_raidz_new_math, ulong, 0644);
MODULE_PARM_DESC(zfs_raidz_new_math,
	"Select raidz implementation: old(0), new(!=0)");
#endif
Loading

0 comments on commit 7e8471f

Please sign in to comment.