diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b9479092c842..2c385e29712e 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -24,6 +24,11 @@ * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ +#if defined(_KERNEL) && defined(__x86_64__) +#include +#include +#include +#endif #include #include #include @@ -582,8 +587,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, return (rm); } +void (*vdev_raidz_generate_parity_p)(raidz_map_t *rm); + static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p_c(raidz_map_t *rm) { uint64_t *p, *src, pcount, ccount, i; int c; @@ -609,8 +616,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) } } +void (*vdev_raidz_generate_parity_pq)(raidz_map_t *rm); + static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq_c(raidz_map_t *rm) { uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; @@ -661,8 +670,10 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } } +void (*vdev_raidz_generate_parity_pqr)(raidz_map_t *rm); + static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr_c(raidz_map_t *rm) { uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; int c; @@ -722,6 +733,1096 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) } } +#if defined(__x86_64__) +#define MAKE_CST32_SSE(reg, val) \ + asm volatile("movd %0,%%"#reg : : "r"(val)); \ + asm volatile("pshufd $0,%"#reg",%"#reg) + +#define COPY8P_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))) + +#define COPY8PQ_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(q+6))) + +#define COPY8PQR_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" (*(q+6))); \ + asm volatile("movdqa %%xmm0, %0" : "=m" (*(r+0))); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(r+2))); \ + asm volatile("movdqa %%xmm2, %0" : "=m" (*(r+4))); \ + asm volatile("movdqa %%xmm3, %0" : "=m" 
(*(r+6))) + +#define LOAD8_SRC_SSE \ + asm volatile("movdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("movdqa %0,%%xmm4" : : "m" (*(src+2))); \ + asm volatile("movdqa %0,%%xmm8" : : "m" (*(src+4))); \ + asm volatile("movdqa %0,%%xmm12" : : "m" (*(src+6))) + +#define COMPUTE8_P_SSE \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(p+0))); \ + asm volatile("movdqa %0,%%xmm5" : : "m" (*(p+2))); \ + asm volatile("movdqa %0,%%xmm9" : : "m" (*(p+4))); \ + asm volatile("movdqa %0,%%xmm13" : : "m" (*(p+6))); \ + asm volatile("pxor %xmm0,%xmm1"); \ + asm volatile("pxor %xmm4,%xmm5"); \ + asm volatile("pxor %xmm8,%xmm9"); \ + asm volatile("pxor %xmm12,%xmm13"); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(p+0))); \ + asm volatile("movdqa %%xmm5, %0" : "=m" (*(p+2))); \ + asm volatile("movdqa %%xmm9, %0" : "=m" (*(p+4))); \ + asm volatile("movdqa %%xmm13, %0" : "=m" (*(p+6))) + +#define COMPUTE8_Q_SSE \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(q+0))); \ + asm volatile("movdqa %0,%%xmm5" : : "m" (*(q+2))); \ + asm volatile("movdqa %0,%%xmm9" : : "m" (*(q+4))); \ + asm volatile("movdqa %0,%%xmm13" : : "m" (*(q+6))); \ + asm volatile("pxor %xmm2, %xmm2"); \ + asm volatile("pxor %xmm6, %xmm6"); \ + asm volatile("pxor %xmm10, %xmm10"); \ + asm volatile("pxor %xmm14, %xmm14"); \ + asm volatile("pcmpgtb %xmm1, %xmm2"); \ + asm volatile("pcmpgtb %xmm5, %xmm6"); \ + asm volatile("pcmpgtb %xmm9, %xmm10"); \ + asm volatile("pcmpgtb %xmm13, %xmm14"); \ + asm volatile("psllq $1,%xmm1"); \ + asm volatile("psllq $1,%xmm5"); \ + asm volatile("psllq $1,%xmm9"); \ + asm volatile("psllq $1,%xmm13"); \ + MAKE_CST32_SSE(xmm3, 0x1d1d1d1d); \ + asm volatile("pand %xmm3,%xmm2"); \ + asm volatile("pand %xmm3,%xmm6"); \ + asm volatile("pand %xmm3,%xmm10"); \ + asm volatile("pand %xmm3,%xmm14"); \ + MAKE_CST32_SSE(xmm3, 0xfefefefe); \ + asm volatile("pand %xmm3,%xmm1"); \ + asm volatile("pand %xmm3,%xmm5"); \ + asm volatile("pand %xmm3,%xmm9"); \ + asm volatile("pand %xmm3,%xmm13"); \ + asm volatile("pxor %xmm2, %xmm1"); \ + asm volatile("pxor %xmm6, %xmm5"); \ + asm volatile("pxor %xmm10, %xmm9"); \ + asm volatile("pxor %xmm14, %xmm13"); \ + asm volatile("pxor %xmm0, %xmm1"); \ + asm volatile("pxor %xmm4, %xmm5"); \ + asm volatile("pxor %xmm8, %xmm9"); \ + asm volatile("pxor %xmm12, %xmm13"); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(q+0))); \ + asm volatile("movdqa %%xmm5, %0" : "=m" (*(q+2))); \ + asm volatile("movdqa %%xmm9, %0" : "=m" (*(q+4))); \ + asm volatile("movdqa %%xmm13, %0" : "=m" (*(q+6))) + +#define COMPUTE8_R_SSE \ + asm volatile("movdqa %0,%%xmm1" : : "m" (*(r+0))); \ + asm volatile("movdqa %0,%%xmm5" : : "m" (*(r+2))); \ + asm volatile("movdqa %0,%%xmm9" : : "m" (*(r+4))); \ + asm volatile("movdqa %0,%%xmm13" : : "m" (*(r+6))); \ + for (j = 0; j < 2; j++) { \ + asm volatile("pxor %xmm2, %xmm2"); \ + asm volatile("pxor %xmm6, %xmm6"); \ + asm volatile("pxor %xmm10, %xmm10"); \ + asm volatile("pxor %xmm14, %xmm14"); \ + asm volatile("pcmpgtb %xmm1, %xmm2"); \ + asm volatile("pcmpgtb %xmm5, %xmm6"); \ + asm volatile("pcmpgtb %xmm9, %xmm10"); \ + asm volatile("pcmpgtb %xmm13, %xmm14"); \ + asm volatile("psllq $1,%xmm1"); \ + asm volatile("psllq $1,%xmm5"); \ + asm volatile("psllq $1,%xmm9"); \ + asm volatile("psllq $1,%xmm13"); \ + MAKE_CST32_SSE(xmm3, 0x1d1d1d1d); \ + asm volatile("pand %xmm3,%xmm2"); \ + asm volatile("pand %xmm3,%xmm6"); \ + asm volatile("pand %xmm3,%xmm10"); \ + asm volatile("pand %xmm3,%xmm14"); \ + MAKE_CST32_SSE(xmm3, 0xfefefefe); \ + asm volatile("pand %xmm3,%xmm1"); \ + 
asm volatile("pand %xmm3,%xmm5"); \ + asm volatile("pand %xmm3,%xmm9"); \ + asm volatile("pand %xmm3,%xmm13"); \ + asm volatile("pxor %xmm2, %xmm1"); \ + asm volatile("pxor %xmm6, %xmm5"); \ + asm volatile("pxor %xmm10, %xmm9"); \ + asm volatile("pxor %xmm14, %xmm13"); \ + } \ + asm volatile("pxor %xmm0, %xmm1"); \ + asm volatile("pxor %xmm4, %xmm5"); \ + asm volatile("pxor %xmm8, %xmm9"); \ + asm volatile("pxor %xmm12, %xmm13"); \ + asm volatile("movdqa %%xmm1, %0" : "=m" (*(r+0))); \ + asm volatile("movdqa %%xmm5, %0" : "=m" (*(r+2))); \ + asm volatile("movdqa %%xmm9, %0" : "=m" (*(r+4))); \ + asm volatile("movdqa %%xmm13, %0" : "=m" (*(r+6))) + + +static void +vdev_raidz_generate_parity_p_sse(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + COPY8P_SSE; + } + for (; i < ccount; i++, src++, p++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + LOAD8_SRC_SSE; + COMPUTE8_P_SSE; + } + for (; i < ccount; i++, src++, p++) { + *p ^= *src; + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + +static void +vdev_raidz_generate_parity_pq_sse(raidz_map_t *rm) +{ + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + COPY8PQ_SSE; + } + for (; i < ccnt; i++, src++, p++, q++) { + *p = *src; + *q = *src; + } + for (; i < pcnt; i++, src++, p++, q++) { + *p = 0; + *q = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + LOAD8_SRC_SSE; + COMPUTE8_P_SSE; + COMPUTE8_Q_SSE; + } + for (; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + +static void +vdev_raidz_generate_parity_pqr_sse(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i, j; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, + q += 8, r += 8) { + COPY8PQR_SSE; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, + q += 8, r += 8) { + LOAD8_SRC_SSE; + COMPUTE8_P_SSE; + COMPUTE8_Q_SSE; + COMPUTE8_R_SSE; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} +#undef MAKE_CST32_SSE +#undef COPY8P_SSE +#undef COPY8PQ_SSE +#undef COPY8PQR_SSE +#undef LOAD8_SRC_SSE +#undef COMPUTE8_P_SSE +#undef COMPUTE8_Q_SSE +#undef COMPUTE8_R_SSE + +#if defined(_KERNEL) && defined(CONFIG_AS_AVX2) +#define MAKE_CST32_AVX2(regx, regy, val) \ + asm volatile("vmovd %0,%%"#regx : : "r"(val)); \ + asm volatile("vpbroadcastd %"#regx",%"#regy) + +#define COPY16P_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))) + +#define COPY16PQ_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))) + +#define COPY16PQR_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm2" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (*(src+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(p+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(q+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(q+12))); \ + asm volatile("vmovdqa %%ymm0, %0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%ymm2, %0" : "=m" (*(r+8))); \ + asm volatile("vmovdqa %%ymm3, %0" : "=m" (*(r+12))) + +#define LOAD16_SRC_AVX2 \ + asm volatile("vmovdqa %0,%%ymm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%ymm4" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%ymm8" : : "m" (*(src+8))); \ + asm volatile("vmovdqa %0,%%ymm12" : : "m" (*(src+12))) + +#define COMPUTE16_P_AVX2 \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(p+0))); \ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(p+4))); \ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(p+8))); \ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(p+12))); \ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); \ + asm volatile("vpxor %ymm4,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm8,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm12,%ymm13,%ymm13"); \ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(p+8))); \ + asm volatile("vmovdqa 
%%ymm13,%0" : "=m" (*(p+12))) + +#define COMPUTE16_Q_AVX2 \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(q+0))); \ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(q+4))); \ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(q+8))); \ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(q+12))); \ + asm volatile("vpxor %ymm14, %ymm14, %ymm14"); \ + asm volatile("vpcmpgtb %ymm1, %ymm14, %ymm2"); \ + asm volatile("vpcmpgtb %ymm5, %ymm14, %ymm6"); \ + asm volatile("vpcmpgtb %ymm9, %ymm14, %ymm10"); \ + asm volatile("vpcmpgtb %ymm13, %ymm14, %ymm14"); \ + asm volatile("vpsllq $1,%ymm1,%ymm1"); \ + asm volatile("vpsllq $1,%ymm5,%ymm5"); \ + asm volatile("vpsllq $1,%ymm9,%ymm9"); \ + asm volatile("vpsllq $1,%ymm13,%ymm13"); \ + MAKE_CST32_AVX2(xmm3, ymm3, 0x1d1d1d1d); \ + asm volatile("vpand %ymm3,%ymm2,%ymm2"); \ + asm volatile("vpand %ymm3,%ymm6,%ymm6"); \ + asm volatile("vpand %ymm3,%ymm10,%ymm10"); \ + asm volatile("vpand %ymm3,%ymm14,%ymm14"); \ + MAKE_CST32_AVX2(xmm3, ymm3, 0xfefefefe); \ + asm volatile("vpand %ymm3,%ymm1,%ymm1"); \ + asm volatile("vpand %ymm3,%ymm5,%ymm5"); \ + asm volatile("vpand %ymm3,%ymm9,%ymm9"); \ + asm volatile("vpand %ymm3,%ymm13,%ymm13"); \ + asm volatile("vpxor %ymm2,%ymm1,%ymm1"); \ + asm volatile("vpxor %ymm6,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm10,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm14,%ymm13,%ymm13"); \ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); \ + asm volatile("vpxor %ymm4,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm8,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm12,%ymm13,%ymm13"); \ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(q+8))); \ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(q+12))) + +#define COMPUTE16_R_AVX2 \ + asm volatile("vmovdqa %0,%%ymm1" : : "m" (*(r+0))); \ + asm volatile("vmovdqa %0,%%ymm5" : : "m" (*(r+4))); \ + asm volatile("vmovdqa %0,%%ymm9" : : "m" (*(r+8))); \ + asm volatile("vmovdqa %0,%%ymm13" : : "m" (*(r+12))); \ + for (j = 0; j < 2; j++) { \ + asm volatile("vpxor %ymm14, %ymm14, %ymm14"); \ + asm volatile("vpcmpgtb %ymm1, %ymm14, %ymm2"); \ + asm volatile("vpcmpgtb %ymm5, %ymm14, %ymm6"); \ + asm volatile("vpcmpgtb %ymm9, %ymm14, %ymm10"); \ + asm volatile("vpcmpgtb %ymm13, %ymm14, %ymm14"); \ + asm volatile("vpsllq $1,%ymm1,%ymm1"); \ + asm volatile("vpsllq $1,%ymm5,%ymm5"); \ + asm volatile("vpsllq $1,%ymm9,%ymm9"); \ + asm volatile("vpsllq $1,%ymm13,%ymm13"); \ + MAKE_CST32_AVX2(xmm3, ymm3, 0x1d1d1d1d); \ + asm volatile("vpand %ymm3,%ymm2,%ymm2"); \ + asm volatile("vpand %ymm3,%ymm6,%ymm6"); \ + asm volatile("vpand %ymm3,%ymm10,%ymm10"); \ + asm volatile("vpand %ymm3,%ymm14,%ymm14"); \ + MAKE_CST32_AVX2(xmm3, ymm3, 0xfefefefe); \ + asm volatile("vpand %ymm3,%ymm1,%ymm1"); \ + asm volatile("vpand %ymm3,%ymm5,%ymm5"); \ + asm volatile("vpand %ymm3,%ymm9,%ymm9"); \ + asm volatile("vpand %ymm3,%ymm13,%ymm13"); \ + asm volatile("vpxor %ymm2,%ymm1,%ymm1"); \ + asm volatile("vpxor %ymm6,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm10,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm14,%ymm13,%ymm13"); \ + } \ + asm volatile("vpxor %ymm0,%ymm1,%ymm1"); \ + asm volatile("vpxor %ymm4,%ymm5,%ymm5"); \ + asm volatile("vpxor %ymm8,%ymm9,%ymm9"); \ + asm volatile("vpxor %ymm12,%ymm13,%ymm13"); \ + asm volatile("vmovdqa %%ymm1,%0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%ymm5,%0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%ymm9,%0" : "=m" (*(r+8))); \ + asm volatile("vmovdqa %%ymm13,%0" : "=m" (*(r+12))) + +static void 
+vdev_raidz_generate_parity_p_avx2(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + i = 0; + if (ccount > 15) /* ccount is unsigned */ + for (; i < ccount-15; i += 16, src += 16, p += 16) { + COPY16P_AVX2; + } + for (; i < ccount; i++, src++, p++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + i = 0; + if (ccount > 15) /* ccount is unsigned */ + for (; i < ccount-15; i += 16, src += 16, p += 16) { + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + } + for (; i < ccount; i++, src++, p++) { + *p ^= *src; + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + +static void +vdev_raidz_generate_parity_pq_avx2(raidz_map_t *rm) +{ + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, + p += 16, q += 16) { + COPY16PQ_AVX2; + } + for (; i < ccnt; i++, src++, p++, q++) { + *p = *src; + *q = *src; + } + for (; i < pcnt; i++, src++, p++, q++) { + *p = 0; + *q = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, + p += 16, q += 16) { + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + COMPUTE16_Q_AVX2; + } + for (; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + +static void +vdev_raidz_generate_parity_pqr_avx2(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i, j; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, + p += 16, q += 16, r += 16) { + COPY16PQR_AVX2; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt > 15) /* ccnt is unsigned */ + for (; i < ccnt-15; i += 16, src += 16, + p += 16, q += 16, r += 16) { + LOAD16_SRC_AVX2; + COMPUTE16_P_AVX2; + COMPUTE16_Q_AVX2; + COMPUTE16_R_AVX2; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} +#undef MAKE_CST32_AVX2 +#undef COPY16P_AVX2 +#undef COPY16PQ_AVX2 +#undef COPY16PQR_AVX2 +#undef LOAD16_SRC_AVX2 +#undef COMPUTE16_P_AVX2 +#undef COMPUTE16_Q_AVX2 +#undef COMPUTE16_R_AVX2 +#endif // _KERNEL && CONFIG_AS_AVX2 + +#if defined(_KERNEL) && defined(CONFIG_AS_AVX) +#define MAKE_CST32_AVX128(reg, val) \ + asm volatile("vmovd %0,%%"#reg : : "r"(val)); \ + asm volatile("vpshufd $0,%"#reg",%"#reg) + +#define COPY8P_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))) + +#define COPY8PQ_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(q+6))) + +#define COPY8PQR_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm2" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm3" : : "m" (*(src+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(p+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(p+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(q+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(q+6))); \ + asm volatile("vmovdqa %%xmm0, %0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%xmm1, %0" : "=m" (*(r+2))); \ + asm volatile("vmovdqa %%xmm2, %0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%xmm3, %0" : "=m" (*(r+6))) + +#define LOAD8_SRC_AVX128 \ + asm volatile("vmovdqa %0,%%xmm0" : : "m" (*(src+0))); \ + asm volatile("vmovdqa %0,%%xmm4" : : "m" (*(src+2))); \ + asm volatile("vmovdqa %0,%%xmm8" : : "m" (*(src+4))); \ + asm volatile("vmovdqa %0,%%xmm12" : : "m" (*(src+6))) + +#define COMPUTE8_P_AVX128 \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(p+0))); \ + asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(p+2))); \ + asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(p+4))); \ + asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(p+6))); \ + asm volatile("vpxor %xmm0,%xmm1,%xmm1"); \ + asm volatile("vpxor %xmm4,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm8,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm12,%xmm13,%xmm13"); \ + asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(p+0))); \ + asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(p+2))); \ + asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(p+4))); 
\ + asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(p+6))) + +#define COMPUTE8_Q_AVX128 \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(q+0))); \ + asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(q+2))); \ + asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(q+4))); \ + asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(q+6))); \ + asm volatile("vpxor %xmm14, %xmm14, %xmm14"); \ + asm volatile("vpcmpgtb %xmm1, %xmm14, %xmm2"); \ + asm volatile("vpcmpgtb %xmm5, %xmm14, %xmm6"); \ + asm volatile("vpcmpgtb %xmm9, %xmm14, %xmm10"); \ + asm volatile("vpcmpgtb %xmm13, %xmm14, %xmm14"); \ + asm volatile("vpsllq $1,%xmm1,%xmm1"); \ + asm volatile("vpsllq $1,%xmm5,%xmm5"); \ + asm volatile("vpsllq $1,%xmm9,%xmm9"); \ + asm volatile("vpsllq $1,%xmm13,%xmm13"); \ + MAKE_CST32_AVX128(xmm3, 0x1d1d1d1d); \ + asm volatile("vpand %xmm3,%xmm2,%xmm2"); \ + asm volatile("vpand %xmm3,%xmm6,%xmm6"); \ + asm volatile("vpand %xmm3,%xmm10,%xmm10"); \ + asm volatile("vpand %xmm3,%xmm14,%xmm14"); \ + MAKE_CST32_AVX128(xmm3, 0xfefefefe); \ + asm volatile("vpand %xmm3,%xmm1,%xmm1"); \ + asm volatile("vpand %xmm3,%xmm5,%xmm5"); \ + asm volatile("vpand %xmm3,%xmm9,%xmm9"); \ + asm volatile("vpand %xmm3,%xmm13,%xmm13"); \ + asm volatile("vpxor %xmm2,%xmm1,%xmm1"); \ + asm volatile("vpxor %xmm6,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm10,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm14,%xmm13,%xmm13"); \ + asm volatile("vpxor %xmm0,%xmm1,%xmm1"); \ + asm volatile("vpxor %xmm4,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm8,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm12,%xmm13,%xmm13"); \ + asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(q+0))); \ + asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(q+2))); \ + asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(q+4))); \ + asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(q+6))) + +#define COMPUTE8_R_AVX128 \ + asm volatile("vmovdqa %0,%%xmm1" : : "m" (*(r+0))); \ + asm volatile("vmovdqa %0,%%xmm5" : : "m" (*(r+2))); \ + asm volatile("vmovdqa %0,%%xmm9" : : "m" (*(r+4))); \ + asm volatile("vmovdqa %0,%%xmm13" : : "m" (*(r+6))); \ + for (j = 0; j < 2; j++) { \ + asm volatile("vpxor %xmm14, %xmm14, %xmm14"); \ + asm volatile("vpcmpgtb %xmm1, %xmm14, %xmm2"); \ + asm volatile("vpcmpgtb %xmm5, %xmm14, %xmm6"); \ + asm volatile("vpcmpgtb %xmm9, %xmm14, %xmm10"); \ + asm volatile("vpcmpgtb %xmm13, %xmm14, %xmm14"); \ + asm volatile("vpsllq $1,%xmm1,%xmm1"); \ + asm volatile("vpsllq $1,%xmm5,%xmm5"); \ + asm volatile("vpsllq $1,%xmm9,%xmm9"); \ + asm volatile("vpsllq $1,%xmm13,%xmm13"); \ + MAKE_CST32_AVX128(xmm3, 0x1d1d1d1d); \ + asm volatile("vpand %xmm3,%xmm2,%xmm2"); \ + asm volatile("vpand %xmm3,%xmm6,%xmm6"); \ + asm volatile("vpand %xmm3,%xmm10,%xmm10"); \ + asm volatile("vpand %xmm3,%xmm14,%xmm14"); \ + MAKE_CST32_AVX128(xmm3, 0xfefefefe); \ + asm volatile("vpand %xmm3,%xmm1,%xmm1"); \ + asm volatile("vpand %xmm3,%xmm5,%xmm5"); \ + asm volatile("vpand %xmm3,%xmm9,%xmm9"); \ + asm volatile("vpand %xmm3,%xmm13,%xmm13"); \ + asm volatile("vpxor %xmm2,%xmm1,%xmm1"); \ + asm volatile("vpxor %xmm6,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm10,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm14,%xmm13,%xmm13"); \ + } \ + asm volatile("vpxor %xmm0,%xmm1,%xmm1"); \ + asm volatile("vpxor %xmm4,%xmm5,%xmm5"); \ + asm volatile("vpxor %xmm8,%xmm9,%xmm9"); \ + asm volatile("vpxor %xmm12,%xmm13,%xmm13"); \ + asm volatile("vmovdqa %%xmm1,%0" : "=m" (*(r+0))); \ + asm volatile("vmovdqa %%xmm5,%0" : "=m" (*(r+2))); \ + asm volatile("vmovdqa %%xmm9,%0" : "=m" (*(r+4))); \ + asm volatile("vmovdqa %%xmm13,%0" : "=m" (*(r+6))) + + +static void 
+vdev_raidz_generate_parity_p_avx128(raidz_map_t *rm) +{ + uint64_t *p, *src, pcount, ccount, i; + int c; + + pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccount == pcount); + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + COPY8P_AVX128; + } + for (; i < ccount; i++, src++, p++) { + *p = *src; + } + } else { + ASSERT(ccount <= pcount); + i = 0; + if (ccount > 7) /* ccount is unsigned */ + for (; i < ccount-7; i += 8, src += 8, p += 8) { + LOAD8_SRC_AVX128; + COMPUTE8_P_AVX128; + } + for (; i < ccount; i++, src++, p++) { + *p ^= *src; + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + +static void +vdev_raidz_generate_parity_pq_avx128(raidz_map_t *rm) +{ + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + COPY8PQ_AVX128; + } + for (; i < ccnt; i++, src++, p++, q++) { + *p = *src; + *q = *src; + } + for (; i < pcnt; i++, src++, p++, q++) { + *p = 0; + *q = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, q += 8) { + LOAD8_SRC_AVX128; + COMPUTE8_P_AVX128; + COMPUTE8_Q_AVX128; + } + for (; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +#if defined(_KERNEL) + kernel_fpu_end(); +#endif +} + +static void +vdev_raidz_generate_parity_pqr_avx128(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i, j; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); +#if defined(_KERNEL) + kernel_fpu_begin(); +#endif + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, + q += 8, r += 8) { + COPY8PQR_AVX128; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + i = 0; + if (ccnt > 7) /* ccnt is unsigned */ + for (; i < ccnt-7; i += 8, src += 8, p += 8, + q += 8, r += 8) { + LOAD8_SRC_AVX128; + COMPUTE8_P_AVX128; + COMPUTE8_Q_AVX128; + COMPUTE8_R_AVX128; + } + for (; i < ccnt; i++, src++, p++, q++, r++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+			 */
+			for (; i < pcnt; i++, q++, r++) {
+				VDEV_RAIDZ_64MUL_2(*q, mask);
+				VDEV_RAIDZ_64MUL_4(*r, mask);
+			}
+		}
+	}
+#if defined(_KERNEL)
+	kernel_fpu_end();
+#endif
+}
+#undef MAKE_CST32_AVX128
+#undef COPY8P_AVX128
+#undef COPY8PQ_AVX128
+#undef COPY8PQR_AVX128
+#undef LOAD8_SRC_AVX128
+#undef COMPUTE8_P_AVX128
+#undef COMPUTE8_Q_AVX128
+#undef COMPUTE8_R_AVX128
+
+#endif /* _KERNEL && CONFIG_AS_AVX */
+
+#endif /* __x86_64__ */
+
+static void
+vdev_raidz_pick_parity_functions(void)
+{
+	vdev_raidz_generate_parity_p = &vdev_raidz_generate_parity_p_c;
+	vdev_raidz_generate_parity_pq = &vdev_raidz_generate_parity_pq_c;
+	vdev_raidz_generate_parity_pqr = &vdev_raidz_generate_parity_pqr_c;
+#if defined(__x86_64__)
+#if defined(_KERNEL) && defined(CONFIG_AS_AVX2)
+	if (boot_cpu_has(X86_FEATURE_AVX2)) {
+		vdev_raidz_generate_parity_p =
+		    &vdev_raidz_generate_parity_p_avx2;
+		vdev_raidz_generate_parity_pq =
+		    &vdev_raidz_generate_parity_pq_avx2;
+		vdev_raidz_generate_parity_pqr =
+		    &vdev_raidz_generate_parity_pqr_avx2;
+		printk("ZFS: using vdev_raidz_generate_parity_*_avx2\n");
+	} else
+#endif
+#if defined(_KERNEL) && defined(CONFIG_AS_AVX)
+	if (boot_cpu_has(X86_FEATURE_AVX)) {
+		vdev_raidz_generate_parity_p =
+		    &vdev_raidz_generate_parity_p_avx128;
+		vdev_raidz_generate_parity_pq =
+		    &vdev_raidz_generate_parity_pq_avx128;
+		vdev_raidz_generate_parity_pqr =
+		    &vdev_raidz_generate_parity_pqr_avx128;
+		printk("ZFS: using vdev_raidz_generate_parity_*_avx128\n");
+	} else
+#endif
+	{
+		/* x86-64 always has SSE2 */
+		vdev_raidz_generate_parity_p =
+		    &vdev_raidz_generate_parity_p_sse;
+		vdev_raidz_generate_parity_pq =
+		    &vdev_raidz_generate_parity_pq_sse;
+		vdev_raidz_generate_parity_pqr =
+		    &vdev_raidz_generate_parity_pqr_sse;
+#if defined(_KERNEL)
+		printk("ZFS: using vdev_raidz_generate_parity_*_sse\n");
+#endif
+	}
+#endif
+}
+
 /*
  * Generate RAID parity in the first virtual columns according to the number of
  * parity columns available.
@@ -1481,6 +2582,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
 	int lasterror = 0;
 	int numerrors = 0;
 
+	/*
+	 * This should really be done once at module load rather than
+	 * on every vdev open.  As written, concurrent opens can race
+	 * on which parity implementation ends up selected, so this
+	 * call probably belongs in the module init path instead.
+	 */
+	vdev_raidz_pick_parity_functions();
+
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
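
Note (not part of the patch): the densest piece of the new assembly is the byte-wise multiply-by-2 in GF(2^8) that COMPUTE8_Q_SSE, COMPUTE8_R_SSE and their AVX/AVX2 counterparts implement with the pcmpgtb/psllq/pand/pxor sequence, using the same 0x1d reduction constant as the existing VDEV_RAIDZ_64MUL_2 macro. The standalone userspace sketch below shows the equivalent computation with SSE2 intrinsics and cross-checks it against the scalar definition; the function name gf256_mul2_sse2 and the test harness are illustrative only and do not appear in the patch.

/* Illustrative sketch only -- not taken from the patch. */
#include <emmintrin.h>	/* SSE2 intrinsics */
#include <stdint.h>
#include <stdio.h>

/*
 * Multiply each byte of v by 2 in GF(2^8) (polynomial 0x11d), mirroring
 * one pcmpgtb/psllq/pand/pxor round of COMPUTE8_Q_SSE.
 */
static __m128i
gf256_mul2_sse2(__m128i v)
{
	/* 0xff in every byte whose high (sign) bit is set: (0 > byte). */
	__m128i hi = _mm_cmpgt_epi8(_mm_setzero_si128(), v);

	/*
	 * psllq shifts whole 64-bit lanes, so after doubling, mask off
	 * the bit that leaked in from the byte below (the 0xfe mask).
	 */
	__m128i dbl = _mm_and_si128(_mm_slli_epi64(v, 1),
	    _mm_set1_epi8((char)0xfe));

	/* Fold the reduction constant back in where the high bit was set. */
	return (_mm_xor_si128(dbl, _mm_and_si128(hi, _mm_set1_epi8(0x1d))));
}

int
main(void)
{
	uint8_t in[16], out[16];
	int i;

	for (i = 0; i < 16; i++)
		in[i] = (uint8_t)(i * 17 + 3);

	_mm_storeu_si128((__m128i *)out,
	    gf256_mul2_sse2(_mm_loadu_si128((const __m128i *)in)));

	for (i = 0; i < 16; i++) {
		/* Scalar reference: what VDEV_RAIDZ_64MUL_2 does per byte. */
		uint8_t ref = (uint8_t)((in[i] << 1) ^
		    ((in[i] & 0x80) ? 0x1d : 0));
		printf("%02x -> %02x (%s)\n", in[i], out[i],
		    out[i] == ref ? "ok" : "MISMATCH");
	}
	return (0);
}

Built with a plain "cc -O2" on x86-64 (SSE2 is baseline there, as the patch's dispatch code also assumes), every line should print "ok".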