diff --git a/cmd/ztest.c b/cmd/ztest.c index 170347b2830b..593e3238e922 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -137,6 +137,7 @@ #include #include #include +#include #include #if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ @@ -6574,6 +6575,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) int i, *ptr; uint32_t size; BLAKE3_CTX ctx; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); @@ -6598,7 +6600,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) void *res2 = &zc_res2; /* BLAKE3_KEY_LEN = 32 */ - VERIFY0(blake3_impl_setname("generic")); + VERIFY0(blake3->setname("generic")); templ = abd_checksum_blake3_tmpl_init(&salt); Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); @@ -6607,7 +6609,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) ZIO_CHECKSUM_BSWAP(&zc_ref2); abd_checksum_blake3_tmpl_free(templ); - VERIFY0(blake3_impl_setname("cycle")); + VERIFY0(blake3->setname("cycle")); while (run_count-- > 0) { /* Test current implementation */ diff --git a/include/sys/blake3.h b/include/sys/blake3.h index ad65fc8db7b9..b981b18db943 100644 --- a/include/sys/blake3.h +++ b/include/sys/blake3.h @@ -22,11 +22,11 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor - * Copyright (c) 2021 Tino Reichardt + * Copyright (c) 2021-2022 Tino Reichardt */ -#ifndef BLAKE3_H -#define BLAKE3_H +#ifndef _SYS_BLAKE3_H +#define _SYS_BLAKE3_H #ifdef _KERNEL #include @@ -97,26 +97,8 @@ extern void **blake3_per_cpu_ctx; extern void blake3_per_cpu_ctx_init(void); extern void blake3_per_cpu_ctx_fini(void); -/* get count of supported implementations */ -extern uint32_t blake3_impl_getcnt(void); - -/* get id of selected implementation */ -extern uint32_t blake3_impl_getid(void); - -/* get name of selected implementation */ -extern const char *blake3_impl_getname(void); - -/* setup id as fastest implementation */ -extern 
void blake3_impl_set_fastest(uint32_t id); - -/* set implementation by id */ -extern void blake3_impl_setid(uint32_t id); - -/* set implementation by name */ -extern int blake3_impl_setname(const char *name); - #ifdef __cplusplus } #endif -#endif /* BLAKE3_H */ +#endif /* _SYS_BLAKE3_H */ diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c index 8e441f454a72..4f93e4ff2051 100644 --- a/module/icp/algs/blake3/blake3.c +++ b/module/icp/algs/blake3/blake3.c @@ -432,7 +432,7 @@ static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], memcpy(ctx->key, key, BLAKE3_KEY_LEN); chunk_state_init(&ctx->chunk, key, flags); ctx->cv_stack_len = 0; - ctx->ops = blake3_impl_get_ops(); + ctx->ops = blake3_get_ops(); } /* diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c index 94a1f108236e..ca7197a26f39 100644 --- a/module/icp/algs/blake3/blake3_generic.c +++ b/module/icp/algs/blake3/blake3_generic.c @@ -187,7 +187,8 @@ static inline void blake3_hash_many_generic(const uint8_t * const *inputs, } } -static inline boolean_t blake3_is_generic_supported(void) +/* the generic implementation is always okay */ +static boolean_t blake3_is_supported(void) { return (B_TRUE); } @@ -196,7 +197,7 @@ const blake3_ops_t blake3_generic_impl = { .compress_in_place = blake3_compress_in_place_generic, .compress_xof = blake3_compress_xof_generic, .hash_many = blake3_hash_many_generic, - .is_supported = blake3_is_generic_supported, + .is_supported = blake3_is_supported, .degree = 4, .name = "generic" }; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c index 7bc4db2c9806..f68a5edfeaa4 100644 --- a/module/icp/algs/blake3/blake3_impl.c +++ b/module/icp/algs/blake3/blake3_impl.c @@ -24,222 +24,266 @@ */ #include -#include +#include +#include +#include #include "blake3_impl.h" -static const blake3_ops_t *const blake3_impls[] = { - &blake3_generic_impl, #if defined(__aarch64__) || \ 
(defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse2_impl, -#endif -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse41_impl, -#endif -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) - &blake3_avx2_impl, -#endif -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) - &blake3_avx512_impl, -#endif -}; - -/* Select BLAKE3 implementation */ -#define IMPL_FASTEST (UINT32_MAX) -#define IMPL_CYCLE (UINT32_MAX - 1) - -#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) - -/* Indicate that benchmark has been done */ -static boolean_t blake3_initialized = B_FALSE; - -/* Implementation that contains the fastest methods */ -static blake3_ops_t blake3_fastest_impl = { - .name = "fastest" -}; -/* Hold all supported implementations */ -static const blake3_ops_t *blake3_supp_impls[ARRAY_SIZE(blake3_impls)]; -static uint32_t blake3_supp_impls_cnt = 0; +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} -/* Currently selected implementation */ -static uint32_t blake3_impl_chosen = IMPL_FASTEST; 
+static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} -static struct blake3_impl_selector { - const char *name; - uint32_t sel; -} blake3_impl_selectors[] = { - { "cycle", IMPL_CYCLE }, - { "fastest", IMPL_FASTEST } -}; +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} -/* check the supported implementations */ -static void blake3_impl_init(void) +static boolean_t blake3_is_sse2_supported(void) { - int i, c; - - /* init only once */ - if (likely(blake3_initialized)) - return; +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} - /* move supported implementations into blake3_supp_impls */ - for (i = 0, c = 0; i < ARRAY_SIZE(blake3_impls); i++) { - const blake3_ops_t *impl = blake3_impls[i]; +const blake3_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif - if (impl->is_supported && impl->is_supported()) - blake3_supp_impls[c++] = impl; - } - blake3_supp_impls_cnt = c; +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - /* first init generic impl, may be changed via 
set_fastest() */ - memcpy(&blake3_fastest_impl, blake3_impls[0], - sizeof (blake3_fastest_impl)); - blake3_initialized = B_TRUE; +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); } -/* get number of supported implementations */ -uint32_t -blake3_impl_getcnt(void) -{ - blake3_impl_init(); - return (blake3_supp_impls_cnt); +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); } -/* get id of selected implementation */ -uint32_t -blake3_impl_getid(void) -{ - return (IMPL_READ(blake3_impl_chosen)); +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* get name of selected implementation */ -const 
char * -blake3_impl_getname(void) +static boolean_t blake3_is_sse41_supported(void) { - uint32_t impl = IMPL_READ(blake3_impl_chosen); - - blake3_impl_init(); - switch (impl) { - case IMPL_FASTEST: - return ("fastest"); - case IMPL_CYCLE: - return ("cycle"); - default: - return (blake3_supp_impls[impl]->name); - } +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif } -/* setup id as fastest implementation */ -void -blake3_impl_set_fastest(uint32_t id) -{ - /* setup fastest impl */ - memcpy(&blake3_fastest_impl, blake3_supp_impls[id], - sizeof (blake3_fastest_impl)); +const blake3_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* set implementation by id */ -void -blake3_impl_setid(uint32_t id) +static boolean_t blake3_is_avx2_supported(void) { - blake3_impl_init(); - switch (id) { - case IMPL_FASTEST: - atomic_swap_32(&blake3_impl_chosen, IMPL_FASTEST); - break; - case IMPL_CYCLE: - 
atomic_swap_32(&blake3_impl_chosen, IMPL_CYCLE); - break; - default: - ASSERT3U(id, <, blake3_supp_impls_cnt); - atomic_swap_32(&blake3_impl_chosen, id); - break; - } + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); } -/* set implementation by name */ -int -blake3_impl_setname(const char *val) -{ - uint32_t impl = IMPL_READ(blake3_impl_chosen); - size_t val_len; - int i, err = -EINVAL; - - blake3_impl_init(); - val_len = strlen(val); - while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ - val_len--; - - /* check mandatory implementations */ - for (i = 0; i < ARRAY_SIZE(blake3_impl_selectors); i++) { - const char *name = blake3_impl_selectors[i].name; - - if (val_len == strlen(name) && - strncmp(val, name, val_len) == 0) { - impl = blake3_impl_selectors[i].sel; - err = 0; - break; - } - } +const blake3_ops_t +blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif - if (err != 0 && blake3_initialized) { - /* check all supported implementations */ - for (i = 0; i < blake3_supp_impls_cnt; i++) { - const char *name = blake3_supp_impls[i]->name; - - if (val_len == strlen(name) && - strncmp(val, name, val_len) == 0) { - impl = i; - err = 0; - break; - } - } - } +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t 
increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} - if (err == 0) { - atomic_swap_32(&blake3_impl_chosen, impl); - } +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} - return (err); +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -const blake3_ops_t * -blake3_impl_get_ops(void) +static boolean_t blake3_is_avx512_supported(void) { - const blake3_ops_t *ops = NULL; - uint32_t impl = IMPL_READ(blake3_impl_chosen); - - blake3_impl_init(); - switch (impl) { - case IMPL_FASTEST: - ASSERT(blake3_initialized); - ops = &blake3_fastest_impl; - break; - case IMPL_CYCLE: - /* Cycle through supported implementations */ - ASSERT(blake3_initialized); - ASSERT3U(blake3_supp_impls_cnt, >, 0); - static uint32_t cycle_count = 0; - uint32_t idx = (++cycle_count) % blake3_supp_impls_cnt; - ops = blake3_supp_impls[idx]; - break; - default: - ASSERT3U(blake3_supp_impls_cnt, >, 0); - ASSERT3U(impl, <, blake3_supp_impls_cnt); - ops = blake3_supp_impls[impl]; - break; - } - - ASSERT3P(ops, !=, NULL); - return (ops); + return (kfpu_allowed() && zfs_avx512f_available() && + 
zfs_avx512vl_available()); } -#if defined(_KERNEL) +const blake3_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif + +extern const blake3_ops_t blake3_generic_impl; + +static const blake3_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; +/* use the generic implementation functions */ +#define IMPL_NAME "blake3" +#define IMPL_OPS_T blake3_ops_t +#define IMPL_ARRAY blake3_impls +#define IMPL_GET_OPS blake3_get_ops +#define ZFS_IMPL_OPS zfs_blake3_ops +#include + +#ifdef _KERNEL void **blake3_per_cpu_ctx; void @@ -253,9 +297,6 @@ blake3_per_cpu_ctx_init(void) blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX), KM_SLEEP); } - - /* init once in kernel mode */ - blake3_impl_init(); } void @@ -276,7 +317,7 @@ blake3_per_cpu_ctx_fini(void) static int blake3_param_get(char *buffer, zfs_kernel_param_t *unused) { - const uint32_t impl = IMPL_READ(blake3_impl_chosen); + const uint32_t impl = IMPL_READ(generic_impl_chosen); char *fmt; int cnt = 0; @@ -289,10 +330,11 @@ blake3_param_get(char *buffer, zfs_kernel_param_t *unused) cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); /* list all supported implementations */ - for (uint32_t i = 0; i < blake3_supp_impls_cnt; ++i) { + generic_impl_init(); + for (uint32_t i = 
0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, - blake3_supp_impls[i]->name); + generic_supp_impls[i]->name); } return (cnt); @@ -302,7 +344,7 @@ static int blake3_param_set(const char *val, zfs_kernel_param_t *unused) { (void) unused; - return (blake3_impl_setname(val)); + return (generic_impl_setname(val)); } #elif defined(__FreeBSD__) @@ -314,8 +356,9 @@ blake3_param(ZFS_MODULE_PARAM_ARGS) { int err; + generic_impl_init(); if (req->newptr == NULL) { - const uint32_t impl = IMPL_READ(blake3_impl_chosen); + const uint32_t impl = IMPL_READ(generic_impl_chosen); const int init_buflen = 64; const char *fmt; struct sbuf *s; @@ -331,9 +374,9 @@ blake3_param(ZFS_MODULE_PARAM_ARGS) (void) sbuf_printf(s, fmt, "fastest"); /* list all supported implementations */ - for (uint32_t i = 0; i < blake3_supp_impls_cnt; ++i) { + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); - (void) sbuf_printf(s, fmt, blake3_supp_impls[i]->name); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); } err = sbuf_finish(s); @@ -349,7 +392,7 @@ blake3_param(ZFS_MODULE_PARAM_ARGS) return (err); } - return (-blake3_impl_setname(buf)); + return (-generic_impl_setname(buf)); } #endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h index ecb51e3a3010..90d508fac08f 100644 --- a/module/icp/algs/blake3/blake3_impl.h +++ b/module/icp/algs/blake3/blake3_impl.h @@ -25,14 +25,13 @@ * Copyright (c) 2021-2022 Tino Reichardt */ -#ifndef BLAKE3_IMPL_H +#ifndef BLAKE3_IMPL_H #define BLAKE3_IMPL_H #ifdef __cplusplus extern "C" { #endif -#include #include #include #include @@ -56,7 +55,7 @@ typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, typedef boolean_t (*blake3_is_supported_f)(void); -typedef struct blake3_impl_ops { +typedef struct { blake3_compress_in_place_f compress_in_place; blake3_compress_xof_f compress_xof; blake3_hash_many_f hash_many; @@ 
-65,30 +64,8 @@ typedef struct blake3_impl_ops { const char *name; } blake3_ops_t; -/* Return selected BLAKE3 implementation ops */ -extern const blake3_ops_t *blake3_impl_get_ops(void); - -extern const blake3_ops_t blake3_generic_impl; - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_ops_t blake3_sse2_impl; -#endif - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_ops_t blake3_sse41_impl; -#endif - -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern const blake3_ops_t blake3_avx2_impl; -#endif - -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern const blake3_ops_t blake3_avx512_impl; -#endif +/* return selected BLAKE3 implementation ops */ +extern const blake3_ops_t *blake3_get_ops(void); #if defined(__x86_64) #define MAX_SIMD_DEGREE 16 diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c deleted file mode 100644 index 04a8b3333656..000000000000 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2021-2022 Tino Reichardt - */ - -#include "blake3_impl.h" - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - -extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_sse2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, 
flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_sse2_supported(void) -{ -#if defined(__x86_64) - return (kfpu_allowed() && zfs_sse2_available()); -#elif defined(__PPC64__) && defined(__linux__) - return (kfpu_allowed() && zfs_vsx_available()); -#else - return (kfpu_allowed()); -#endif -} - -const blake3_ops_t blake3_sse2_impl = { - .compress_in_place = blake3_compress_in_place_sse2, - .compress_xof = blake3_compress_xof_sse2, - .hash_many = blake3_hash_many_sse2, - .is_supported = blake3_is_sse2_supported, - .degree = 4, - .name = "sse2" -}; -#endif - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - -extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_sse41(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const 
uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_sse41_supported(void) -{ -#if defined(__x86_64) - return (kfpu_allowed() && zfs_sse4_1_available()); -#elif defined(__PPC64__) && defined(__linux__) - return (kfpu_allowed() && zfs_vsx_available()); -#else - return (kfpu_allowed()); -#endif -} - -const blake3_ops_t blake3_sse41_impl = { - .compress_in_place = blake3_compress_in_place_sse41, - .compress_xof = blake3_compress_xof_sse41, - .hash_many = blake3_hash_many_sse41, - .is_supported = blake3_is_sse41_supported, - .degree = 4, - .name = "sse41" -}; -#endif - -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_hash_many_avx2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_avx2_supported(void) -{ - return (kfpu_allowed() && zfs_sse4_1_available() && - zfs_avx2_available()); -} - -const blake3_ops_t blake3_avx2_impl = { - .compress_in_place = blake3_compress_in_place_sse41, - .compress_xof = blake3_compress_xof_sse41, - .hash_many = blake3_hash_many_avx2, - .is_supported = blake3_is_avx2_supported, - .degree = 8, - .name = "avx2" -}; -#endif - -#if 
defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_avx512(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_avx512_supported(void) -{ - return (kfpu_allowed() && zfs_avx512f_available() && - zfs_avx512vl_available()); -} - -const blake3_ops_t blake3_avx512_impl = { - .compress_in_place = blake3_compress_in_place_avx512, - .compress_xof = blake3_compress_xof_avx512, - .hash_many = blake3_hash_many_avx512, - 
.is_supported = blake3_is_avx512_supported, - .degree = 16, - .name = "avx512" -}; -#endif diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c index 648e1faaaeb7..aebe0363cc6e 100644 --- a/tests/zfs-tests/cmd/checksum/blake3_test.c +++ b/tests/zfs-tests/cmd/checksum/blake3_test.c @@ -31,6 +31,8 @@ #include #include +#include + /* * set it to a define for debugging */ @@ -485,10 +487,14 @@ main(int argc, char *argv[]) uint8_t buffer[102400]; uint64_t cpu_mhz = 0; int id, i, j; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); if (argc == 2) cpu_mhz = atoi(argv[1]); + if (!blake3) + return (1); + /* fill test message */ for (i = 0, j = 0; i < sizeof (buffer); i++, j++) { if (j == 251) @@ -497,9 +503,9 @@ main(int argc, char *argv[]) } (void) printf("Running algorithm correctness tests:\n"); - for (id = 0; id < blake3_impl_getcnt(); id++) { - blake3_impl_setid(id); - const char *name = blake3_impl_getname(); + for (id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); + const char *name = blake3->getname(); dprintf("Result for BLAKE3-%s:\n", name); for (i = 0; TestArray[i].hash; i++) { blake3_test_t *cur = &TestArray[i]; @@ -565,9 +571,9 @@ main(int argc, char *argv[]) } while (0) printf("Running performance tests (hashing 1024 MiB of data):\n"); - for (id = 0; id < blake3_impl_getcnt(); id++) { - blake3_impl_setid(id); - const char *name = blake3_impl_getname(); + for (id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); + const char *name = blake3->getname(); BLAKE3_PERF_TEST(name, 256); }