From e76373de7b7384bb6e5c6fd5e04f15b54df20fb7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 7 Jun 2021 12:02:47 -0400 Subject: [PATCH] More aggsum optimizations - Avoid atomic_add() when updating as_lower_bound/as_upper_bound. Previous code was excessively strong on 64bit systems while not strong enough on 32bit ones. Instead introduce and use real atomic_load() and atomic_store() operations, just an assignments on 64bit machines, but using proper atomics on 32bit ones to avoid torn reads/writes. - Reduce number of buckets on large systems. Extra buckets not as much improve add speed, as hurt reads. Unlike wmsum for aggsum reads are still important. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. Closes #12145 --- include/os/linux/spl/sys/atomic.h | 4 + include/sys/aggsum.h | 7 +- lib/libspl/asm-generic/atomic.c | 13 ++++ lib/libspl/include/atomic.h | 43 ++++++++++ module/zfs/aggsum.c | 125 ++++++++++++++++-------------- 5 files changed, 129 insertions(+), 63 deletions(-) diff --git a/include/os/linux/spl/sys/atomic.h b/include/os/linux/spl/sys/atomic.h index 2d21cbb3e140..8f7fa5aeda11 100644 --- a/include/os/linux/spl/sys/atomic.h +++ b/include/os/linux/spl/sys/atomic.h @@ -48,6 +48,8 @@ #define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v)) #define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y) #define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x) +#define atomic_load_32(v) atomic_read((atomic_t *)(v)) +#define atomic_store_32(v, x) atomic_set((atomic_t *)(v), x) #define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v)) #define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v)) #define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v)) @@ -58,6 +60,8 @@ #define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v)) #define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y) #define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x) +#define atomic_load_64(v) atomic64_read((atomic64_t *)(v)) +#define atomic_store_64(v, x) atomic64_set((atomic64_t *)(v), x) #ifdef _LP64 static __inline__ void * diff --git a/include/sys/aggsum.h b/include/sys/aggsum.h index cb43727f1df4..65800058cbf6 100644 --- a/include/sys/aggsum.h +++ b/include/sys/aggsum.h @@ -39,15 +39,16 @@ struct aggsum_bucket { typedef struct aggsum { kmutex_t as_lock; int64_t as_lower_bound; - int64_t as_upper_bound; + uint64_t as_upper_bound; + aggsum_bucket_t *as_buckets ____cacheline_aligned; uint_t as_numbuckets; - aggsum_bucket_t *as_buckets; + uint_t as_bucketshift; } aggsum_t; void aggsum_init(aggsum_t *, uint64_t); void aggsum_fini(aggsum_t *); int64_t aggsum_lower_bound(aggsum_t *); -int64_t aggsum_upper_bound(aggsum_t *); +uint64_t aggsum_upper_bound(aggsum_t *); int aggsum_compare(aggsum_t *, uint64_t); uint64_t aggsum_value(aggsum_t *); void aggsum_add(aggsum_t *, int64_t); diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c index 35535ea49c79..504422b8e226 100644 --- a/lib/libspl/asm-generic/atomic.c +++ b/lib/libspl/asm-generic/atomic.c @@ -390,6 +390,19 @@ atomic_swap_ptr(volatile void *target, void *bits) return (old); } +#ifndef _LP64 +uint64_t +atomic_load_64(volatile uint64_t *target) +{ + return (__atomic_load_n(target, __ATOMIC_RELAXED)); +} + +void +atomic_store_64(volatile uint64_t *target, uint64_t bits) +{ + return (__atomic_store_n(target, bits, __ATOMIC_RELAXED)); +} +#endif int atomic_set_long_excl(volatile ulong_t *target, uint_t value) diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h index f8c257f9696b..8dd1d654a486 100644 --- a/lib/libspl/include/atomic.h +++ b/lib/libspl/include/atomic.h @@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t); extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); #endif +/* + * Atomically read variable. + */ +#define atomic_load_char(p) (*(volatile uchar_t *)(p)) +#define atomic_load_short(p) (*(volatile ushort_t *)(p)) +#define atomic_load_int(p) (*(volatile uint_t *)(p)) +#define atomic_load_long(p) (*(volatile ulong_t *)(p)) +#define atomic_load_ptr(p) (*(volatile __typeof(*p) *)(p)) +#define atomic_load_8(p) (*(volatile uint8_t *)(p)) +#define atomic_load_16(p) (*(volatile uint16_t *)(p)) +#define atomic_load_32(p) (*(volatile uint32_t *)(p)) +#ifdef _LP64 +#define atomic_load_64(p) (*(volatile uint64_t *)(p)) +#elif defined(_INT64_TYPE) +extern uint64_t atomic_load_64(volatile uint64_t *); +#endif + +/* + * Atomically write variable. + */ +#define atomic_store_char(p, v) \ + (*(volatile uchar_t *)(p) = (uchar_t)(v)) +#define atomic_store_short(p, v) \ + (*(volatile ushort_t *)(p) = (ushort_t)(v)) +#define atomic_store_int(p, v) \ + (*(volatile uint_t *)(p) = (uint_t)(v)) +#define atomic_store_long(p, v) \ + (*(volatile ulong_t *)(p) = (ulong_t)(v)) +#define atomic_store_ptr(p, v) \ + (*(volatile __typeof(*p) *)(p) = (v)) +#define atomic_store_8(p, v) \ + (*(volatile uint8_t *)(p) = (uint8_t)(v)) +#define atomic_store_16(p, v) \ + (*(volatile uint16_t *)(p) = (uint16_t)(v)) +#define atomic_store_32(p, v) \ + (*(volatile uint32_t *)(p) = (uint32_t)(v)) +#ifdef _LP64 +#define atomic_store_64(p, v) \ + (*(volatile uint64_t *)(p) = (uint64_t)(v)) +#elif defined(_INT64_TYPE) +extern void atomic_store_64(volatile uint64_t *, uint64_t); +#endif + /* * Perform an exclusive atomic bit set/clear on a target. * Returns 0 if bit was successfully set/cleared, or -1 diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c index e46da95f676c..c4ea4f86fc5f 100644 --- a/module/zfs/aggsum.c +++ b/module/zfs/aggsum.c @@ -78,11 +78,11 @@ */ /* - * We will borrow aggsum_borrow_multiplier times the current request, so we will - * have to get the as_lock approximately every aggsum_borrow_multiplier calls to - * aggsum_delta(). + * We will borrow 2^aggsum_borrow_shift times the current request, so we will + * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to + * aggsum_add(). */ -static uint_t aggsum_borrow_multiplier = 10; +static uint_t aggsum_borrow_shift = 4; void aggsum_init(aggsum_t *as, uint64_t value) @@ -90,9 +90,14 @@ aggsum_init(aggsum_t *as, uint64_t value) bzero(as, sizeof (*as)); as->as_lower_bound = as->as_upper_bound = value; mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); - as->as_numbuckets = boot_ncpus; - as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), - KM_SLEEP); + /* + * Too many buckets may hurt read performance without improving + * write. From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc. + */ + as->as_bucketshift = highbit64(boot_ncpus / 6) / 2; + as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1; + as->as_buckets = kmem_zalloc(as->as_numbuckets * + sizeof (aggsum_bucket_t), KM_SLEEP); for (int i = 0; i < as->as_numbuckets; i++) { mutex_init(&as->as_buckets[i].asc_lock, NULL, MUTEX_DEFAULT, NULL); @@ -111,59 +116,49 @@ aggsum_fini(aggsum_t *as) int64_t aggsum_lower_bound(aggsum_t *as) { - return (as->as_lower_bound); + return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound)); } -int64_t +uint64_t aggsum_upper_bound(aggsum_t *as) { - return (as->as_upper_bound); -} - -static void -aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) -{ - ASSERT(MUTEX_HELD(&as->as_lock)); - ASSERT(MUTEX_HELD(&asb->asc_lock)); - - /* - * We use atomic instructions for this because we read the upper and - * lower bounds without the lock, so we need stores to be atomic. - */ - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - asb->asc_delta + asb->asc_borrowed); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - asb->asc_delta - asb->asc_borrowed); - asb->asc_delta = 0; - asb->asc_borrowed = 0; + return (atomic_load_64(&as->as_upper_bound)); } uint64_t aggsum_value(aggsum_t *as) { - int64_t rv; + int64_t lb; + uint64_t ub; mutex_enter(&as->as_lock); - if (as->as_lower_bound == as->as_upper_bound) { - rv = as->as_lower_bound; + lb = as->as_lower_bound; + ub = as->as_upper_bound; + if (lb == ub) { for (int i = 0; i < as->as_numbuckets; i++) { ASSERT0(as->as_buckets[i].asc_delta); ASSERT0(as->as_buckets[i].asc_borrowed); } mutex_exit(&as->as_lock); - return (rv); + return (lb); } for (int i = 0; i < as->as_numbuckets; i++) { struct aggsum_bucket *asb = &as->as_buckets[i]; + if (asb->asc_borrowed == 0) + continue; mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); + lb += asb->asc_delta + asb->asc_borrowed; + ub += asb->asc_delta - asb->asc_borrowed; + asb->asc_delta = 0; + asb->asc_borrowed = 0; mutex_exit(&asb->asc_lock); } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - rv = as->as_lower_bound; + ASSERT3U(lb, ==, ub); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); + atomic_store_64(&as->as_upper_bound, lb); mutex_exit(&as->as_lock); - return (rv); + return (lb); } void @@ -172,7 +167,8 @@ aggsum_add(aggsum_t *as, int64_t delta) struct aggsum_bucket *asb; int64_t borrow; - asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets]; + asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) % + as->as_numbuckets]; /* Try fast path if we already borrowed enough before. */ mutex_enter(&asb->asc_lock); @@ -188,21 +184,22 @@ aggsum_add(aggsum_t *as, int64_t delta) * We haven't borrowed enough. Take the global lock and borrow * considering what is requested now and what we borrowed before. */ - borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier; + borrow = (delta < 0 ? -delta : delta); + borrow <<= aggsum_borrow_shift + as->as_bucketshift; mutex_enter(&as->as_lock); - mutex_enter(&asb->asc_lock); - delta += asb->asc_delta; - asb->asc_delta = 0; if (borrow >= asb->asc_borrowed) borrow -= asb->asc_borrowed; else borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; + mutex_enter(&asb->asc_lock); + delta += asb->asc_delta; + asb->asc_delta = 0; asb->asc_borrowed += borrow; - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - delta - borrow); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - delta + borrow); mutex_exit(&asb->asc_lock); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, + as->as_lower_bound + delta - borrow); + atomic_store_64(&as->as_upper_bound, + as->as_upper_bound + delta + borrow); mutex_exit(&as->as_lock); } @@ -214,27 +211,35 @@ aggsum_add(aggsum_t *as, int64_t delta) int aggsum_compare(aggsum_t *as, uint64_t target) { - if (as->as_upper_bound < target) + int64_t lb; + uint64_t ub; + int i; + + if (atomic_load_64(&as->as_upper_bound) < target) return (-1); - if (as->as_lower_bound > target) + lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound); + if (lb > 0 && (uint64_t)lb > target) return (1); mutex_enter(&as->as_lock); - for (int i = 0; i < as->as_numbuckets; i++) { + lb = as->as_lower_bound; + ub = as->as_upper_bound; + for (i = 0; i < as->as_numbuckets; i++) { struct aggsum_bucket *asb = &as->as_buckets[i]; + if (asb->asc_borrowed == 0) + continue; mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); + lb += asb->asc_delta + asb->asc_borrowed; + ub += asb->asc_delta - asb->asc_borrowed; + asb->asc_delta = 0; + asb->asc_borrowed = 0; mutex_exit(&asb->asc_lock); - if (as->as_upper_bound < target) { - mutex_exit(&as->as_lock); - return (-1); - } - if (as->as_lower_bound > target) { - mutex_exit(&as->as_lock); - return (1); - } + if (ub < target || (lb > 0 && (uint64_t)lb > target)) + break; } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - ASSERT3U(as->as_lower_bound, ==, target); + if (i >= as->as_numbuckets) + ASSERT3U(lb, ==, ub); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); + atomic_store_64(&as->as_upper_bound, ub); mutex_exit(&as->as_lock); - return (0); + return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0); }